Diffstat (limited to 'fs')
60 files changed, 1166 insertions, 2313 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index d8207a1b8c44..a6313a969bc5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -43,7 +43,7 @@ source "fs/f2fs/Kconfig" source "fs/zonefs/Kconfig" config FS_DAX - bool "Direct Access (DAX) support" + bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) @@ -53,8 +53,23 @@ config FS_DAX Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, then you can avoid using the pagecache to buffer I/Os. Turning - on this option will compile in support for DAX; you will need to - mount the filesystem using the -o dax option. + on this option will compile in support for DAX. + + For a DAX device to support file system access it needs to have + struct pages. For the nfit based NVDIMMs this can be enabled + using the ndctl utility: + + # ndctl create-namespace --force --reconfig=namespace0.0 \ + --mode=fsdax --map=mem + + See the 'create-namespace' man page for details on the overhead of + --map=mem: + https://docs.pmem.io/ndctl-user-guide/ndctl-man-pages/ndctl-create-namespace + + For ndctl to work CONFIG_DEV_DAX needs to be enabled as well. For most + file systems DAX support needs to be manually enabled globally or + per-inode using a mount option as well. See the file documentation in + Documentation/filesystems/dax.rst for details. If you do not have a block device that is capable of using this, or if unsure, say N. Saying Y will increase the size of the kernel @@ -219,8 +234,7 @@ config ARCH_SUPPORTS_HUGETLBFS config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ - ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. 
For architectures that support it, say Y here and read @@ -353,7 +367,7 @@ source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" source "fs/ksmbd/Kconfig" -config CIFS_COMMON +config SMBFS_COMMON tristate default y if CIFS=y default m if CIFS=m diff --git a/fs/Makefile b/fs/Makefile index 2f21300851ae..84c5e4cdfee5 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ kernel_read_file.o remap_range.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o block_dev.o direct-io.o mpage.o +obj-y += buffer.o direct-io.o mpage.o else obj-y += no-block.o endif @@ -96,7 +96,7 @@ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-$(CONFIG_UNICODE) += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ -obj-$(CONFIG_CIFS_COMMON) += cifs_common/ +obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/ obj-$(CONFIG_CIFS) += cifs/ obj-$(CONFIG_SMB_SERVER) += ksmbd/ obj-$(CONFIG_HPFS_FS) += hpfs/ diff --git a/fs/attr.c b/fs/attr.c index 87ef39db1c34..473d21b3a86d 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -249,6 +249,34 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, } EXPORT_SYMBOL(setattr_copy); +int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, + unsigned int ia_valid) +{ + int error; + + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(mnt_userns, inode)) { + error = inode_permission(mnt_userns, inode, MAY_WRITE); + if (error) + return error; + } + } + return 0; +} +EXPORT_SYMBOL(may_setattr); + /** * notify_change - modify attributes of a filesytem object * @mnt_userns: user namespace of the mount the inode was found from @@ -290,25 +318,9 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, WARN_ON_ONCE(!inode_is_locked(inode)); - if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - } - - /* - * If utimes(2) and friends are called with times == NULL (or both - * times are UTIME_NOW), then we need to check for write permission - */ - if (ia_valid & ATTR_TOUCH) { - if (IS_IMMUTABLE(inode)) - return -EPERM; - - if (!inode_owner_or_capable(mnt_userns, inode)) { - error = inode_permission(mnt_userns, inode, MAY_WRITE); - if (error) - return error; - } - } + error = may_setattr(mnt_userns, inode, ia_valid); + if (error) + return error; if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; diff --git a/fs/block_dev.c b/fs/block_dev.c deleted file mode 100644 index 45df6cbccf12..000000000000 --- a/fs/block_dev.c +++ /dev/null @@ -1,1695 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2001 Andrea Arcangeli <[email protected]> SuSE - * Copyright (C) 2016 - 2020 Christoph Hellwig - */ - -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/fcntl.h> -#include <linux/slab.h> -#include <linux/kmod.h> -#include <linux/major.h> -#include <linux/device_cgroup.h> -#include <linux/highmem.h> -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/module.h> -#include <linux/blkpg.h> -#include <linux/magic.h> -#include <linux/buffer_head.h> -#include <linux/swap.h> -#include <linux/pagevec.h> 
-#include <linux/writeback.h> -#include <linux/mpage.h> -#include <linux/mount.h> -#include <linux/pseudo_fs.h> -#include <linux/uio.h> -#include <linux/namei.h> -#include <linux/log2.h> -#include <linux/cleancache.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/falloc.h> -#include <linux/part_stat.h> -#include <linux/uaccess.h> -#include <linux/suspend.h> -#include "internal.h" -#include "../block/blk.h" - -struct bdev_inode { - struct block_device bdev; - struct inode vfs_inode; -}; - -static const struct address_space_operations def_blk_aops; - -static inline struct bdev_inode *BDEV_I(struct inode *inode) -{ - return container_of(inode, struct bdev_inode, vfs_inode); -} - -struct block_device *I_BDEV(struct inode *inode) -{ - return &BDEV_I(inode)->bdev; -} -EXPORT_SYMBOL(I_BDEV); - -static void bdev_write_inode(struct block_device *bdev) -{ - struct inode *inode = bdev->bd_inode; - int ret; - - spin_lock(&inode->i_lock); - while (inode->i_state & I_DIRTY) { - spin_unlock(&inode->i_lock); - ret = write_inode_now(inode, true); - if (ret) { - char name[BDEVNAME_SIZE]; - pr_warn_ratelimited("VFS: Dirty inode writeback failed " - "for block device %s (err=%d).\n", - bdevname(bdev, name), ret); - } - spin_lock(&inode->i_lock); - } - spin_unlock(&inode->i_lock); -} - -/* Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - if (mapping_empty(mapping)) - return; - - invalidate_bh_lrus(); - truncate_inode_pages(mapping, 0); -} - -/* Invalidate clean unused buffers and pagecache. */ -void invalidate_bdev(struct block_device *bdev) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - if (mapping->nrpages) { - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); - } - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_invalidate_inode(mapping); -} -EXPORT_SYMBOL(invalidate_bdev); - -/* - * Drop all buffers & page cache for given bdev range. This function bails - * with error if bdev has other exclusive owner (such as filesystem). - */ -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, - loff_t lstart, loff_t lend) -{ - /* - * If we don't hold exclusive handle for the device, upgrade to it - * while we discard the buffer cache to avoid discarding buffers - * under live filesystem. - */ - if (!(mode & FMODE_EXCL)) { - int err = bd_prepare_to_claim(bdev, truncate_bdev_range); - if (err) - goto invalidate; - } - - truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); - if (!(mode & FMODE_EXCL)) - bd_abort_claiming(bdev, truncate_bdev_range); - return 0; - -invalidate: - /* - * Someone else has handle exclusively open. Try invalidating instead. - * The 'end' argument is inclusive so the rounding is safe. 
- */ - return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, - lstart >> PAGE_SHIFT, - lend >> PAGE_SHIFT); -} - -static void set_init_blocksize(struct block_device *bdev) -{ - unsigned int bsize = bdev_logical_block_size(bdev); - loff_t size = i_size_read(bdev->bd_inode); - - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_inode->i_blkbits = blksize_bits(bsize); -} - -int set_blocksize(struct block_device *bdev, int size) -{ - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) - return -EINVAL; - - /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_logical_block_size(bdev)) - return -EINVAL; - - /* Don't change the size if it is same as current */ - if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { - sync_blockdev(bdev); - bdev->bd_inode->i_blkbits = blksize_bits(size); - kill_bdev(bdev); - } - return 0; -} - -EXPORT_SYMBOL(set_blocksize); - -int sb_set_blocksize(struct super_block *sb, int size) -{ - if (set_blocksize(sb->s_bdev, size)) - return 0; - /* If we get here, we know size is power of two - * and it's value is between 512 and PAGE_SIZE */ - sb->s_blocksize = size; - sb->s_blocksize_bits = blksize_bits(size); - return sb->s_blocksize; -} - -EXPORT_SYMBOL(sb_set_blocksize); - -int sb_min_blocksize(struct super_block *sb, int size) -{ - int minsize = bdev_logical_block_size(sb->s_bdev); - if (size < minsize) - size = minsize; - return sb_set_blocksize(sb, size); -} - -EXPORT_SYMBOL(sb_min_blocksize); - -static int -blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - -static struct inode *bdev_file_inode(struct file *file) -{ - return file->f_mapping->host; -} - -static unsigned int dio_bio_write_op(struct kiocb *iocb) -{ - unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; - - /* avoid the need for a I/O completion work item */ - if (iocb->ki_flags & IOCB_DSYNC) - op |= REQ_FUA; - return op; -} - -#define DIO_INLINE_BIO_VECS 4 - -static void blkdev_bio_end_io_simple(struct bio *bio) -{ - struct task_struct *waiter = bio->bi_private; - - WRITE_ONCE(bio->bi_private, NULL); - blk_wake_io_task(waiter); -} - -static ssize_t -__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; - loff_t pos = iocb->ki_pos; - bool should_dirty = false; - struct bio bio; - ssize_t ret; - blk_qc_t qc; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - if (nr_pages <= DIO_INLINE_BIO_VECS) - vecs = inline_vecs; - else { - vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), - GFP_KERNEL); - if (!vecs) - return -ENOMEM; - } - - bio_init(&bio, vecs, nr_pages); - bio_set_dev(&bio, bdev); - bio.bi_iter.bi_sector = pos >> 9; - bio.bi_write_hint = iocb->ki_hint; - bio.bi_private = current; - bio.bi_end_io = blkdev_bio_end_io_simple; - bio.bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(&bio, iter); - if (unlikely(ret)) - goto out; - ret = bio.bi_iter.bi_size; - - if (iov_iter_rw(iter) == READ) { - bio.bi_opf = REQ_OP_READ; - if (iter_is_iovec(iter)) - should_dirty = true; - } else { - bio.bi_opf = dio_bio_write_op(iocb); - 
task_io_account_write(ret); - } - if (iocb->ki_flags & IOCB_NOWAIT) - bio.bi_opf |= REQ_NOWAIT; - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(&bio, iocb); - - qc = submit_bio(&bio); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio.bi_private)) - break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - bio_release_pages(&bio, should_dirty); - if (unlikely(bio.bi_status)) - ret = blk_status_to_errno(bio.bi_status); - -out: - if (vecs != inline_vecs) - kfree(vecs); - - bio_uninit(&bio); - - return ret; -} - -struct blkdev_dio { - union { - struct kiocb *iocb; - struct task_struct *waiter; - }; - size_t size; - atomic_t ref; - bool multi_bio : 1; - bool should_dirty : 1; - bool is_sync : 1; - struct bio bio; -}; - -static struct bio_set blkdev_dio_pool; - -static int blkdev_iopoll(struct kiocb *kiocb, bool wait) -{ - struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); - struct request_queue *q = bdev_get_queue(bdev); - - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); -} - -static void blkdev_bio_end_io(struct bio *bio) -{ - struct blkdev_dio *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - - if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { - if (!dio->is_sync) { - struct kiocb *iocb = dio->iocb; - ssize_t ret; - - if (likely(!dio->bio.bi_status)) { - ret = dio->size; - iocb->ki_pos += ret; - } else { - ret = blk_status_to_errno(dio->bio.bi_status); - } - - dio->iocb->ki_complete(iocb, ret, 0); - if (dio->multi_bio) - bio_put(&dio->bio); - } else { - struct task_struct *waiter = dio->waiter; - - WRITE_ONCE(dio->waiter, NULL); - blk_wake_io_task(waiter); - } - } - - if (should_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(inode); - struct blk_plug plug; - struct blkdev_dio *dio; - struct bio *bio; - bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; - bool is_read = (iov_iter_rw(iter) == READ), is_sync; - loff_t pos = iocb->ki_pos; - blk_qc_t qc = BLK_QC_T_NONE; - int ret = 0; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); - - dio = container_of(bio, struct blkdev_dio, bio); - dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) { - dio->waiter = current; - bio_get(bio); - } else { - dio->iocb = iocb; - } - - dio->size = 0; - dio->multi_bio = false; - dio->should_dirty = is_read && iter_is_iovec(iter); - - /* - * Don't plug for HIPRI/polled IO, as those should go straight - * to issue - */ - if (!is_poll) - blk_start_plug(&plug); - - for (;;) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = pos >> 9; - bio->bi_write_hint = iocb->ki_hint; - bio->bi_private = dio; - bio->bi_end_io = blkdev_bio_end_io; - bio->bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - break; - } - - if (is_read) { - bio->bi_opf = REQ_OP_READ; - if (dio->should_dirty) - bio_set_pages_dirty(bio); - } else { - bio->bi_opf = dio_bio_write_op(iocb); 
- task_io_account_write(bio->bi_iter.bi_size); - } - if (iocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; - - dio->size += bio->bi_iter.bi_size; - pos += bio->bi_iter.bi_size; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); - if (!nr_pages) { - bool polled = false; - - if (iocb->ki_flags & IOCB_HIPRI) { - bio_set_polled(bio, iocb); - polled = true; - } - - qc = submit_bio(bio); - - if (polled) - WRITE_ONCE(iocb->ki_cookie, qc); - break; - } - - if (!dio->multi_bio) { - /* - * AIO needs an extra reference to ensure the dio - * structure which is embedded into the first bio - * stays around. - */ - if (!is_sync) - bio_get(bio); - dio->multi_bio = true; - atomic_set(&dio->ref, 2); - } else { - atomic_inc(&dio->ref); - } - - submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); - } - - if (!is_poll) - blk_finish_plug(&plug); - - if (!is_sync) - return -EIOCBQUEUED; - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(dio->waiter)) - break; - - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - if (!ret) - ret = blk_status_to_errno(dio->bio.bi_status); - if (likely(!ret)) - ret = dio->size; - - bio_put(&dio->bio); - return ret; -} - -static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - unsigned int nr_pages; - - if (!iov_iter_count(iter)) - return 0; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); -} - -static __init int blkdev_init(void) -{ - return bioset_init(&blkdev_dio_pool, 4, - offsetof(struct blkdev_dio, bio), - BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); -} -module_init(blkdev_init); - -int __sync_blockdev(struct block_device *bdev, int wait) -{ - if (!bdev) - return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); -} - -/* - * Write out and wait upon all the dirty data associated with a block - * device via its mapping. Does not take the superblock lock. - */ -int sync_blockdev(struct block_device *bdev) -{ - return __sync_blockdev(bdev, 1); -} -EXPORT_SYMBOL(sync_blockdev); - -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = sync_filesystem(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} -EXPORT_SYMBOL(fsync_bdev); - -/** - * freeze_bdev -- lock a filesystem and force it into a consistent state - * @bdev: blockdevice to lock - * - * If a superblock is found on this device, we take the s_umount semaphore - * on it to make sure nobody unmounts until the snapshot creation is done. - * The reference counter (bd_fsfreeze_count) guarantees that only the last - * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze - * actually. 
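The kerneldoc above describes a refcounted freeze protocol: freeze_bdev() bumps bd_fsfreeze_count, and only the thaw_bdev() call that drops the count back to zero actually unfreezes. A minimal sketch of the intended pairing for a snapshot-style caller, assuming an already-opened bdev, with error handling trimmed:

	int error;

	error = freeze_bdev(bdev);	/* blocks writes; nested freezes stack */
	if (error)
		return error;
	/* ... take the snapshot while the filesystem is quiesced ... */
	error = thaw_bdev(bdev);	/* only the last thaw really unfreezes */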
- */ -int freeze_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; - - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb); - else - error = freeze_super(sb); - deactivate_super(sb); - - if (error) { - bdev->bd_fsfreeze_count--; - goto done; - } - bdev->bd_fsfreeze_sb = sb; - -sync: - sync_blockdev(bdev); -done: - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; -} -EXPORT_SYMBOL(freeze_bdev); - -/** - * thaw_bdev -- unlock filesystem - * @bdev: blockdevice to unlock - * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). - */ -int thaw_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = -EINVAL; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) - goto out; - - error = 0; - if (--bdev->bd_fsfreeze_count > 0) - goto out; - - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; - - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb); - else - error = thaw_super(sb); - if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; -out: - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; -} -EXPORT_SYMBOL(thaw_bdev); - -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, blkdev_get_block, wbc); -} - -static int blkdev_readpage(struct file * file, struct page * page) -{ - return block_read_full_page(page, blkdev_get_block); -} - -static void blkdev_readahead(struct readahead_control *rac) -{ - mpage_readahead(rac, blkdev_get_block); -} - -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - return block_write_begin(mapping, pos, len, flags, pagep, - blkdev_get_block); -} - -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int ret; - ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - unlock_page(page); - put_page(page); - - return ret; -} - -/* - * private llseek: - * for a block special file file_inode(file)->i_size is zero - * so we compute the size by hand (just as in block_read/write above) - */ -static loff_t block_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *bd_inode = bdev_file_inode(file); - loff_t retval; - - inode_lock(bd_inode); - retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - inode_unlock(bd_inode); - return retval; -} - -static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *bd_inode = bdev_file_inode(filp); - struct block_device *bdev = I_BDEV(bd_inode); - int error; - - error = file_write_and_wait_range(filp, start, end); - if (error) - return error; - - /* - * There is no need to serialise calls to blkdev_issue_flush with - * i_mutex and doing so causes performance issues with concurrent - * O_SYNC writers to a block device. 
- */ - error = blkdev_issue_flush(bdev); - if (error == -EOPNOTSUPP) - error = 0; - - return error; -} - -/** - * bdev_read_page() - Start reading a page from a block device - * @bdev: The device to read the page from - * @sector: The offset on the device to read the page to (need not be aligned) - * @page: The page to read - * - * On entry, the page should be locked. It will be unlocked when the page - * has been read. If the block driver implements rw_page synchronously, - * that will be true on exit from this function, but it need not be. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to read this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. - */ -int bdev_read_page(struct block_device *bdev, sector_t sector, - struct page *page) -{ - const struct block_device_operations *ops = bdev->bd_disk->fops; - int result = -EOPNOTSUPP; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return result; - - result = blk_queue_enter(bdev->bd_disk->queue, 0); - if (result) - return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_READ); - blk_queue_exit(bdev->bd_disk->queue); - return result; -} - -/** - * bdev_write_page() - Start writing a page to a block device - * @bdev: The device to write the page to - * @sector: The offset on the device to write the page to (need not be aligned) - * @page: The page to write - * @wbc: The writeback_control for the write - * - * On entry, the page should be locked and not currently under writeback. - * On exit, if the write started successfully, the page will be unlocked and - * under writeback. If the write failed already (eg the driver failed to - * queue the page to the device), the page will still be locked. If the - * caller is a ->writepage implementation, it will need to unlock the page. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to write this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. 
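Both rw_page helpers share the "soft" error contract spelled out above: a failure means "try another route", not "fail the I/O". A hedged sketch of the expected caller pattern, where submit_fallback_bio() is a hypothetical stand-in for the caller's regular bio-based path:

	int err = bdev_read_page(bdev, sector, page);

	if (err) {
		/* -EOPNOTSUPP, -ENOMEM, queue full, ...: do not propagate,
		 * read the page via the normal submission path instead */
		err = submit_fallback_bio(bdev, sector, page);
	}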
- */ -int bdev_write_page(struct block_device *bdev, sector_t sector, - struct page *page, struct writeback_control *wbc) -{ - int result; - const struct block_device_operations *ops = bdev->bd_disk->fops; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_disk->queue, 0); - if (result) - return result; - - set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_WRITE); - if (result) { - end_page_writeback(page); - } else { - clean_page_buffers(page); - unlock_page(page); - } - blk_queue_exit(bdev->bd_disk->queue); - return result; -} - -/* - * pseudo-fs - */ - -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); -static struct kmem_cache * bdev_cachep __read_mostly; - -static struct inode *bdev_alloc_inode(struct super_block *sb) -{ - struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); - - if (!ei) - return NULL; - memset(&ei->bdev, 0, sizeof(ei->bdev)); - return &ei->vfs_inode; -} - -static void bdev_free_inode(struct inode *inode) -{ - struct block_device *bdev = I_BDEV(inode); - - free_percpu(bdev->bd_stats); - kfree(bdev->bd_meta_info); - - if (!bdev_is_partition(bdev)) { - if (bdev->bd_disk && bdev->bd_disk->bdi) - bdi_put(bdev->bd_disk->bdi); - kfree(bdev->bd_disk); - } - - if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(bdev->bd_dev)); - - kmem_cache_free(bdev_cachep, BDEV_I(inode)); -} - -static void init_once(void *data) -{ - struct bdev_inode *ei = data; - - inode_init_once(&ei->vfs_inode); -} - -static void bdev_evict_inode(struct inode *inode) -{ - truncate_inode_pages_final(&inode->i_data); - invalidate_inode_buffers(inode); /* is it needed here? */ - clear_inode(inode); -} - -static const struct super_operations bdev_sops = { - .statfs = simple_statfs, - .alloc_inode = bdev_alloc_inode, - .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, - .evict_inode = bdev_evict_inode, -}; - -static int bd_init_fs_context(struct fs_context *fc) -{ - struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); - if (!ctx) - return -ENOMEM; - fc->s_iflags |= SB_I_CGROUPWB; - ctx->ops = &bdev_sops; - return 0; -} - -static struct file_system_type bd_type = { - .name = "bdev", - .init_fs_context = bd_init_fs_context, - .kill_sb = kill_anon_super, -}; - -struct super_block *blockdev_superblock __read_mostly; -EXPORT_SYMBOL_GPL(blockdev_superblock); - -void __init bdev_cache_init(void) -{ - int err; - static struct vfsmount *bd_mnt; - - bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), - 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), - init_once); - err = register_filesystem(&bd_type); - if (err) - panic("Cannot register bdev pseudo-fs"); - bd_mnt = kern_mount(&bd_type); - if (IS_ERR(bd_mnt)) - panic("Cannot create bdev pseudo-fs"); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ -} - -struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) -{ - struct block_device *bdev; - struct inode *inode; - - inode = new_inode(blockdev_superblock); - if (!inode) - return NULL; - inode->i_mode = S_IFBLK; - inode->i_rdev = 0; - inode->i_data.a_ops = &def_blk_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - - bdev = I_BDEV(inode); - mutex_init(&bdev->bd_fsfreeze_mutex); - spin_lock_init(&bdev->bd_size_lock); - bdev->bd_disk = disk; - bdev->bd_partno = partno; - bdev->bd_inode = inode; - bdev->bd_stats = alloc_percpu(struct disk_stats); - if 
(!bdev->bd_stats) { - iput(inode); - return NULL; - } - return bdev; -} - -void bdev_add(struct block_device *bdev, dev_t dev) -{ - bdev->bd_dev = dev; - bdev->bd_inode->i_rdev = dev; - bdev->bd_inode->i_ino = dev; - insert_inode_hash(bdev->bd_inode); -} - -long nr_blockdev_pages(void) -{ - struct inode *inode; - long ret = 0; - - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) - ret += inode->i_mapping->nrpages; - spin_unlock(&blockdev_superblock->s_inode_list_lock); - - return ret; -} - -/** - * bd_may_claim - test whether a block device can be claimed - * @bdev: block device of interest - * @whole: whole block device containing @bdev, may equal @bdev - * @holder: holder trying to claim @bdev - * - * Test whether @bdev can be claimed by @holder. - * - * CONTEXT: - * spin_lock(&bdev_lock). - * - * RETURNS: - * %true if @bdev can be claimed, %false otherwise. - */ -static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, - void *holder) -{ - if (bdev->bd_holder == holder) - return true; /* already a holder */ - else if (bdev->bd_holder != NULL) - return false; /* held by someone else */ - else if (whole == bdev) - return true; /* is a whole device which isn't held */ - - else if (whole->bd_holder == bd_may_claim) - return true; /* is a partition of a device that is being partitioned */ - else if (whole->bd_holder != NULL) - return false; /* is a partition of a held device */ - else - return true; /* is a partition of an un-held device */ -} - -/** - * bd_prepare_to_claim - claim a block device - * @bdev: block device of interest - * @holder: holder trying to claim @bdev - * - * Claim @bdev. This function fails if @bdev is already claimed by another - * holder and waits if another claiming is in progress. return, the caller - * has ownership of bd_claiming and bd_holder[s]. - * - * RETURNS: - * 0 if @bdev can be claimed, -EBUSY otherwise. - */ -int bd_prepare_to_claim(struct block_device *bdev, void *holder) -{ - struct block_device *whole = bdev_whole(bdev); - - if (WARN_ON_ONCE(!holder)) - return -EINVAL; -retry: - spin_lock(&bdev_lock); - /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, whole, holder)) { - spin_unlock(&bdev_lock); - return -EBUSY; - } - - /* if claiming is already in progress, wait for it to finish */ - if (whole->bd_claiming) { - wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); - DEFINE_WAIT(wait); - - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&bdev_lock); - schedule(); - finish_wait(wq, &wait); - goto retry; - } - - /* yay, all mine */ - whole->bd_claiming = holder; - spin_unlock(&bdev_lock); - return 0; -} -EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ - -static void bd_clear_claiming(struct block_device *whole, void *holder) -{ - lockdep_assert_held(&bdev_lock); - /* tell others that we're done */ - BUG_ON(whole->bd_claiming != holder); - whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); -} - -/** - * bd_finish_claiming - finish claiming of a block device - * @bdev: block device of interest - * @holder: holder that has claimed @bdev - * - * Finish exclusive open of a block device. Mark the device as exlusively - * open by the holder and wake up all waiters for exclusive open to finish. 
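bd_prepare_to_claim(), bd_finish_claiming() and bd_abort_claiming() together form the claiming handshake; truncate_bdev_range() earlier in this file is a representative in-tree user. A condensed sketch of that temporary-claim pattern, where the holder may be any unique cookie (here a hypothetical pointer):

	int err = bd_prepare_to_claim(bdev, holder);

	if (err)
		return err;	/* -EBUSY: claimed by someone else */
	/* ... work that must not race with another exclusive opener ... */
	bd_abort_claiming(bdev, holder);	/* or bd_finish_claiming() when
						 * an exclusive open succeeds */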
- */ -static void bd_finish_claiming(struct block_device *bdev, void *holder) -{ - struct block_device *whole = bdev_whole(bdev); - - spin_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, whole, holder)); - /* - * Note that for a whole device bd_holders will be incremented twice, - * and bd_holder will be set to bd_may_claim before being set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_may_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; - bd_clear_claiming(whole, holder); - spin_unlock(&bdev_lock); -} - -/** - * bd_abort_claiming - abort claiming of a block device - * @bdev: block device of interest - * @holder: holder that has claimed @bdev - * - * Abort claiming of a block device when the exclusive open failed. This can be - * also used when exclusive open is not actually desired and we just needed - * to block other exclusive openers for a while. - */ -void bd_abort_claiming(struct block_device *bdev, void *holder) -{ - spin_lock(&bdev_lock); - bd_clear_claiming(bdev_whole(bdev), holder); - spin_unlock(&bdev_lock); -} -EXPORT_SYMBOL(bd_abort_claiming); - -static void blkdev_flush_mapping(struct block_device *bdev) -{ - WARN_ON_ONCE(bdev->bd_holders); - sync_blockdev(bdev); - kill_bdev(bdev); - bdev_write_inode(bdev); -} - -static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - int ret = 0; - - if (disk->fops->open) { - ret = disk->fops->open(bdev, mode); - if (ret) { - /* avoid ghost partitions on a removed medium */ - if (ret == -ENOMEDIUM && - test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, true); - return ret; - } - } - - if (!bdev->bd_openers) - set_init_blocksize(bdev); - if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, false); - bdev->bd_openers++; - return 0;; -} - -static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) -{ - if (!--bdev->bd_openers) - blkdev_flush_mapping(bdev); - if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk, mode); -} - -static int blkdev_get_part(struct block_device *part, fmode_t mode) -{ - struct gendisk *disk = part->bd_disk; - int ret; - - if (part->bd_openers) - goto done; - - ret = blkdev_get_whole(bdev_whole(part), mode); - if (ret) - return ret; - - ret = -ENXIO; - if (!bdev_nr_sectors(part)) - goto out_blkdev_put; - - disk->open_partitions++; - set_init_blocksize(part); -done: - part->bd_openers++; - return 0; - -out_blkdev_put: - blkdev_put_whole(bdev_whole(part), mode); - return ret; -} - -static void blkdev_put_part(struct block_device *part, fmode_t mode) -{ - struct block_device *whole = bdev_whole(part); - - if (--part->bd_openers) - return; - blkdev_flush_mapping(part); - whole->bd_disk->open_partitions--; - blkdev_put_whole(whole, mode); -} - -struct block_device *blkdev_get_no_open(dev_t dev) -{ - struct block_device *bdev; - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) { - blk_request_module(dev); - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - } - - /* switch from the inode reference to a device mode one: */ - bdev = &BDEV_I(inode)->bdev; - if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) - bdev = NULL; - iput(inode); - - if (!bdev) - return NULL; - if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || - !try_module_get(bdev->bd_disk->fops->owner)) { - put_device(&bdev->bd_device); - return NULL; - } - - return bdev; -} - -void blkdev_put_no_open(struct block_device *bdev) -{ - 
module_put(bdev->bd_disk->fops->owner); - put_device(&bdev->bd_device); -} - -/** - * blkdev_get_by_dev - open a block device by device number - * @dev: device number of block device to open - * @mode: FMODE_* mask - * @holder: exclusive holder identifier - * - * Open the block device described by device number @dev. If @mode includes - * %FMODE_EXCL, the block device is opened with exclusive access. Specifying - * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for - * the same @holder. - * - * Use this interface ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use blkdev_get_by_path(). - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. - */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) -{ - bool unblock_events = true; - struct block_device *bdev; - struct gendisk *disk; - int ret; - - ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - MAJOR(dev), MINOR(dev), - ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | - ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); - if (ret) - return ERR_PTR(ret); - - bdev = blkdev_get_no_open(dev); - if (!bdev) - return ERR_PTR(-ENXIO); - disk = bdev->bd_disk; - - if (mode & FMODE_EXCL) { - ret = bd_prepare_to_claim(bdev, holder); - if (ret) - goto put_blkdev; - } - - disk_block_events(disk); - - mutex_lock(&disk->open_mutex); - ret = -ENXIO; - if (!disk_live(disk)) - goto abort_claiming; - if (bdev_is_partition(bdev)) - ret = blkdev_get_part(bdev, mode); - else - ret = blkdev_get_whole(bdev, mode); - if (ret) - goto abort_claiming; - if (mode & FMODE_EXCL) { - bd_finish_claiming(bdev, holder); - - /* - * Block event polling for write claims if requested. Any write - * holder makes the write_holder state stick until all are - * released. This is good enough and tracking individual - * writeable reference is too fragile given the way @mode is - * used in blkdev_get/put(). - */ - if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && - (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { - bdev->bd_write_holder = true; - unblock_events = false; - } - } - mutex_unlock(&disk->open_mutex); - - if (unblock_events) - disk_unblock_events(disk); - return bdev; - -abort_claiming: - if (mode & FMODE_EXCL) - bd_abort_claiming(bdev, holder); - mutex_unlock(&disk->open_mutex); - disk_unblock_events(disk); -put_blkdev: - blkdev_put_no_open(bdev); - return ERR_PTR(ret); -} -EXPORT_SYMBOL(blkdev_get_by_dev); - -/** - * blkdev_get_by_path - open a block device by name - * @path: path to the block device to open - * @mode: FMODE_* mask - * @holder: exclusive holder identifier - * - * Open the block device described by the device file at @path. If @mode - * includes %FMODE_EXCL, the block device is opened with exclusive access. - * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may - * nest for the same @holder. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. 
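A short usage sketch of the exclusive-open contract documented above; "/dev/example" and holder_cookie are hypothetical, and %FMODE_EXCL requires a non-NULL holder:

	struct block_device *bdev;

	bdev = blkdev_get_by_path("/dev/example",
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  &holder_cookie);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... exclusive access; opens with the same holder may nest ... */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);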
- */ -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder) -{ - struct block_device *bdev; - dev_t dev; - int error; - - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); - - bdev = blkdev_get_by_dev(dev, mode, holder); - if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, mode); - return ERR_PTR(-EACCES); - } - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_path); - -static int blkdev_open(struct inode * inode, struct file * filp) -{ - struct block_device *bdev; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. - */ - filp->f_flags |= O_LARGEFILE; - - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - - if (filp->f_flags & O_NDELAY) - filp->f_mode |= FMODE_NDELAY; - if (filp->f_flags & O_EXCL) - filp->f_mode |= FMODE_EXCL; - if ((filp->f_flags & O_ACCMODE) == 3) - filp->f_mode |= FMODE_WRITE_IOCTL; - - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - filp->f_mapping = bdev->bd_inode->i_mapping; - filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - return 0; -} - -void blkdev_put(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - - /* - * Sync early if it looks like we're the last one. If someone else - * opens the block device between now and the decrement of bd_openers - * then we did a sync that we didn't need to, but that's not the end - * of the world and we want to avoid long (could be several minute) - * syncs while holding the mutex. - */ - if (bdev->bd_openers == 1) - sync_blockdev(bdev); - - mutex_lock(&disk->open_mutex); - if (mode & FMODE_EXCL) { - struct block_device *whole = bdev_whole(bdev); - bool bdev_free; - - /* - * Release a claim on the device. The holder fields - * are protected with bdev_lock. open_mutex is to - * synchronize disk_holder unlinking. - */ - spin_lock(&bdev_lock); - - WARN_ON_ONCE(--bdev->bd_holders < 0); - WARN_ON_ONCE(--whole->bd_holders < 0); - - if ((bdev_free = !bdev->bd_holders)) - bdev->bd_holder = NULL; - if (!whole->bd_holders) - whole->bd_holder = NULL; - - spin_unlock(&bdev_lock); - - /* - * If this was the last claim, remove holder link and - * unblock evpoll if it was a write holder. - */ - if (bdev_free && bdev->bd_write_holder) { - disk_unblock_events(disk); - bdev->bd_write_holder = false; - } - } - - /* - * Trigger event checking and tell drivers to flush MEDIA_CHANGE - * event. This is to ensure detection of media removal commanded - * from userland - e.g. eject(1). - */ - disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - - if (bdev_is_partition(bdev)) - blkdev_put_part(bdev, mode); - else - blkdev_put_whole(bdev, mode); - mutex_unlock(&disk->open_mutex); - - blkdev_put_no_open(bdev); -} -EXPORT_SYMBOL(blkdev_put); - -static int blkdev_close(struct inode * inode, struct file * filp) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); - blkdev_put(bdev, filp->f_mode); - return 0; -} - -static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. 
- */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - return blkdev_ioctl(bdev, mode, cmd, arg); -} - -/* - * Write data to the block device. Only intended for the block device itself - * and the raw driver which basically is a fake block device. - * - * Does not take i_mutex for the write and thus is not for general purpose - * use. - */ -static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - struct blk_plug plug; - size_t shorted = 0; - ssize_t ret; - - if (bdev_read_only(I_BDEV(bd_inode))) - return -EPERM; - - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if (iocb->ki_pos >= size) - return -ENOSPC; - - if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) - return -EOPNOTSUPP; - - size -= iocb->ki_pos; - if (iov_iter_count(from) > size) { - shorted = iov_iter_count(from) - size; - iov_iter_truncate(from, size); - } - - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - iov_iter_reexpand(from, iov_iter_count(from) + shorted); - blk_finish_plug(&plug); - return ret; -} - -static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - loff_t pos = iocb->ki_pos; - size_t shorted = 0; - ssize_t ret; - - if (pos >= size) - return 0; - - size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; - iov_iter_truncate(to, size); - } - - ret = generic_file_read_iter(iocb, to); - iov_iter_reexpand(to, iov_iter_count(to) + shorted); - return ret; -} - -static int blkdev_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return generic_writepages(mapping, wbc); -} - -static const struct address_space_operations def_blk_aops = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = blkdev_readpage, - .readahead = blkdev_readahead, - .writepage = blkdev_writepage, - .write_begin = blkdev_write_begin, - .write_end = blkdev_write_end, - .writepages = blkdev_writepages, - .direct_IO = blkdev_direct_IO, - .migratepage = buffer_migrate_page_norefs, - .is_dirty_writeback = buffer_check_dirty_writeback, -}; - -#define BLKDEV_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) - -static long blkdev_fallocate(struct file *file, int mode, loff_t start, - loff_t len) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - loff_t end = start + len - 1; - loff_t isize; - int error; - - /* Fail if we don't recognize the flags. */ - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - - /* Don't go off the end of the device. */ - isize = i_size_read(bdev->bd_inode); - if (start >= isize) - return -EINVAL; - if (end >= isize) { - if (mode & FALLOC_FL_KEEP_SIZE) { - len = isize - start; - end = start + len - 1; - } else - return -EINVAL; - } - - /* - * Don't allow IO that isn't aligned to logical block size. - */ - if ((start | len) & (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - /* Invalidate the page cache, including dirty pages. 
*/ - error = truncate_bdev_range(bdev, file->f_mode, start, end); - if (error) - return error; - - switch (mode) { - case FALLOC_FL_ZERO_RANGE: - case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); - break; - default: - return -EOPNOTSUPP; - } - if (error) - return error; - - /* - * Invalidate the page cache again; if someone wandered in and dirtied - * a page, we just discard it - userspace has no way of knowing whether - * the write happened before or after discard completing... - */ - return truncate_bdev_range(bdev, file->f_mode, start, end); -} - -const struct file_operations def_blk_fops = { - .open = blkdev_open, - .release = blkdev_close, - .llseek = block_llseek, - .read_iter = blkdev_read_iter, - .write_iter = blkdev_write_iter, - .iopoll = blkdev_iopoll, - .mmap = generic_file_mmap, - .fsync = blkdev_fsync, - .unlocked_ioctl = block_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = compat_blkdev_ioctl, -#endif - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = blkdev_fallocate, -}; - -/** - * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device - * @dev: return value of the block device's dev_t - * - * Get a reference to the blockdevice at @pathname in the current - * namespace if possible and return it. Return ERR_PTR(error) - * otherwise. - */ -int lookup_bdev(const char *pathname, dev_t *dev) -{ - struct inode *inode; - struct path path; - int error; - - if (!pathname || !*pathname) - return -EINVAL; - - error = kern_path(pathname, LOOKUP_FOLLOW, &path); - if (error) - return error; - - inode = d_backing_inode(path.dentry); - error = -ENOTBLK; - if (!S_ISBLK(inode->i_mode)) - goto out_path_put; - error = -EACCES; - if (!may_open_dev(&path)) - goto out_path_put; - - *dev = inode->i_rdev; - error = 0; -out_path_put: - path_put(&path); - return error; -} -EXPORT_SYMBOL(lookup_bdev); - -int __invalidate_device(struct block_device *bdev, bool kill_dirty) -{ - struct super_block *sb = get_super(bdev); - int res = 0; - - if (sb) { - /* - * no need to lock the super, get_super holds the - * read mutex so the filesystem cannot go away - * under us (->put_super runs with the write lock - * hold). 
- */ - shrink_dcache_sb(sb); - res = invalidate_inodes(sb, kill_dirty); - drop_super(sb); - } - invalidate_bdev(bdev); - return res; -} -EXPORT_SYMBOL(__invalidate_device); - -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) -{ - struct inode *inode, *old_inode = NULL; - - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { - struct address_space *mapping = inode->i_mapping; - struct block_device *bdev; - - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || - mapping->nrpages == 0) { - spin_unlock(&inode->i_lock); - continue; - } - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&blockdev_superblock->s_inode_list_lock); - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ - iput(old_inode); - old_inode = inode; - bdev = I_BDEV(inode); - - mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); - mutex_unlock(&bdev->bd_disk->open_mutex); - - spin_lock(&blockdev_superblock->s_inode_list_lock); - } - spin_unlock(&blockdev_superblock->s_inode_list_lock); - iput(old_inode); -} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f9515dccce0..355ea88d5c5f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3314,6 +3314,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + /* + * Flag our filesystem as having big metadata blocks if they are bigger + * than the page size. 
+ */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + btrfs_info(fs_info, + "flagging fs with big metadata feature"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + + /* Set up fs_info before parsing mount options */ + nodesize = btrfs_super_nodesize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = sectorsize; + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); + + fs_info->nodesize = nodesize; + fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; + fs_info->stripesize = stripesize; + ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; @@ -3341,30 +3365,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_info(fs_info, "has skinny extents"); /* - * flag our filesystem as having big metadata blocks if - * they are bigger than the page size - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } - - nodesize = btrfs_super_nodesize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = sectorsize; - fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); - fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); - - /* Cache block sizes */ - fs_info->nodesize = nodesize; - fs_info->sectorsize = sectorsize; - fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; - fs_info->stripesize = stripesize; - - /* * mixed block groups end up with duplicate but slightly offset * extent buffers for the same range. 
It leads to corruptions */ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41524f9aeac3..cc61813213d8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3223,6 +3223,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; + struct block_device *bdev = NULL; + fmode_t mode; int ret; bool cancel = false; @@ -3255,9 +3257,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) /* Exclusive operation is now claimed */ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) - ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); + ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); else - ret = btrfs_rm_device(fs_info, vol_args->name, 0); + ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); btrfs_exclop_finish(fs_info); @@ -3273,6 +3275,8 @@ out: kfree(vol_args); err_drop: mnt_drop_write_file(file); + if (bdev) + blkdev_put(bdev, mode); return ret; } @@ -3281,6 +3285,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; + struct block_device *bdev = NULL; + fmode_t mode; int ret; bool cancel; @@ -3302,7 +3308,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, vol_args->name, 0); + ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); @@ -3311,7 +3317,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out_drop_write: mnt_drop_write_file(file); - + if (bdev) + blkdev_put(bdev, mode); return ret; } diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 6461ebc3a1c1..340f995652f2 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -5,7 +5,7 @@ #include <linux/sched.h> #include <linux/wait.h> -#include <asm/div64.h> +#include <linux/math64.h> #include <linux/rbtree.h> #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index edb65abf0393..6b51fd2ec5ac 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1049,6 +1049,7 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, u64 len) { struct inode *inode = ordered->inode; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u64 file_offset = ordered->file_offset + pos; u64 disk_bytenr = ordered->disk_bytenr + pos; u64 num_bytes = len; @@ -1066,6 +1067,13 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, else type = __ffs(flags_masked); + /* + * The splitting extent is already counted and will be added again + * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid + * double counting. 
+ */ + percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes, + fs_info->delalloc_batch); if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { WARN_ON_ONCE(1); ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ec3a874165de..464485aa7318 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -558,6 +558,8 @@ static int btrfs_free_stale_devices(const char *path, struct btrfs_device *device, *tmp_device; int ret = 0; + lockdep_assert_held(&uuid_mutex); + if (path) ret = -ENOENT; @@ -988,11 +990,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *orig_dev; int ret = 0; + lockdep_assert_held(&uuid_mutex); + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; - mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { @@ -1024,10 +1027,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } - mutex_unlock(&orig->device_list_mutex); return fs_devices; error: - mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(ret); } @@ -1869,15 +1870,17 @@ out: * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. */ -static void update_dev_time(const char *path_name) +static void update_dev_time(struct block_device *bdev) { - struct file *filp; + struct inode *inode = bdev->bd_inode; + struct timespec64 now; - filp = filp_open(path_name, O_RDWR, 0); - if (IS_ERR(filp)) + /* Shouldn't happen but just in case. */ + if (!inode) return; - file_update_time(filp); - filp_close(filp, NULL); + + now = current_time(inode); + generic_update_time(inode, &now, S_MTIME | S_CTIME); } static int btrfs_rm_dev_item(struct btrfs_device *device) @@ -2053,11 +2056,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + update_dev_time(bdev); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid, struct block_device **bdev, fmode_t *mode) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2171,15 +2174,26 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&fs_devices->device_list_mutex); /* - * at this point, the device is zero sized and detached from - * the devices list. All that's left is to zero out the old - * supers and free the device. + * At this point, the device is zero sized and detached from the + * devices list. All that's left is to zero out the old supers and + * free the device. + * + * We cannot call btrfs_close_bdev() here because we're holding the sb + * write lock, and blkdev_put() will pull in the ->open_mutex on the + * block device and its dependencies. Instead just flush the device + * and let the caller do the final blkdev_put. 
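This lock-ordering constraint is why btrfs_rm_device() now hands the bdev and mode back to its callers instead of closing the device itself. A sketch of the caller side, mirroring the btrfs_ioctl_rm_dev() hunk earlier in this diff:

	struct block_device *bdev = NULL;
	fmode_t mode;

	ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
	/* ... finish the exclusive op, drop sb write access ... */
	mnt_drop_write_file(file);
	if (bdev)
		blkdev_put(bdev, mode);	/* safe: sb write lock no longer held */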
*/ - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device->bdev, device->name->str); + if (device->bdev) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + } - btrfs_close_bdev(device); + *bdev = device->bdev; + *mode = device->mode; synchronize_rcu(); btrfs_free_device(device); @@ -2706,7 +2720,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_forget_devices(device_path); /* Update ctime/mtime for blkid or udev */ - update_dev_time(device_path); + update_dev_time(bdev); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b082250b42e0..2183361db614 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -472,7 +472,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u8 *uuid); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, - const char *device_path, u64 devid); + const char *device_path, u64 devid, + struct block_device **bdev, fmode_t *mode); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7e7a897ae0d3..99b80b5c7a93 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1281,8 +1281,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); - /* zero the stale part of the page if we did a short copy */ if (!PageUptodate(page)) { + /* just return that nothing was copied on a short copy */ if (copied < len) { copied = 0; goto out; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 1409d6149281..058ea2a04376 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -26,12 +26,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); -int ceph_readpage_from_fscache(struct inode *inode, struct page *page); -int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages); - static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { ci->fscache = NULL; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 39db97f149b9..6c0e52fd0743 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -703,29 +703,12 @@ void ceph_add_cap(struct inode *inode, */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); - if (realm) { - struct ceph_snap_realm *oldrealm = ci->i_snap_realm; - if (oldrealm) { - spin_lock(&oldrealm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - spin_unlock(&oldrealm->inodes_with_caps_lock); - } - - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - if (realm->ino == ci->i_vino.ino) - realm->inode = inode; - spin_unlock(&realm->inodes_with_caps_lock); - - if (oldrealm) - ceph_put_snap_realm(mdsc, oldrealm); - } else { - pr_err("ceph_add_cap: couldn't find snap realm %llx\n", - realmino); - WARN_ON(!realm); - } + if (realm) + ceph_change_snap_realm(inode, realm); + else + WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n", + 
__func__, realmino, ci->i_vino.ino, + ci->i_snap_realm ? ci->i_snap_realm->ino : 0); } __check_cap_issue(ci, cap, issued); @@ -1112,20 +1095,6 @@ int ceph_is_any_caps(struct inode *inode) return ret; } -static void drop_inode_snap_realm(struct ceph_inode_info *ci) -{ - struct ceph_snap_realm *realm = ci->i_snap_realm; - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm_counter++; - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, - realm); -} - /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * @@ -1145,17 +1114,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) return; } + lockdep_assert_held(&ci->i_ceph_lock); + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); - if (ci->i_auth_cap == cap) { - WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) && - !mdsc->fsc->blocklisted); + if (ci->i_auth_cap == cap) ci->i_auth_cap = NULL; - } /* remove from session list */ spin_lock(&session->s_cap_lock); @@ -1201,12 +1169,34 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * keep i_snap_realm. */ if (ci->i_wr_ref == 0 && ci->i_snap_realm) - drop_inode_snap_realm(ci); + ceph_change_snap_realm(&ci->vfs_inode, NULL); __cap_delay_cancel(mdsc, ci); } } +void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) +{ + struct ceph_inode_info *ci = cap->ci; + struct ceph_fs_client *fsc; + + /* 'ci' being NULL means the removal has already occurred */ + if (!ci) { + dout("%s: cap inode is NULL\n", __func__); + return; + } + + lockdep_assert_held(&ci->i_ceph_lock); + + fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + WARN_ON_ONCE(ci->i_auth_cap == cap && + !list_empty(&ci->i_dirty_item) && + !fsc->blocklisted && + READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); + + __ceph_remove_cap(cap, queue_release); +} + struct cap_msg_args { struct ceph_mds_session *session; u64 ino, cid, follows; @@ -1335,7 +1325,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); - __ceph_remove_cap(cap, true); + ceph_remove_cap(cap, true); } spin_unlock(&ci->i_ceph_lock); } @@ -1746,6 +1736,9 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void) struct ceph_cap_flush *cf; cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + if (!cf) + return NULL; + cf->is_capsnap = false; return cf; } @@ -1856,6 +1849,8 @@ static u64 __mark_caps_flushing(struct inode *inode, * try to invalidate mapping pages without blocking.
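 * (Note: the function may drop and retake ci->i_ceph_lock while invalidating, which is what the sparse __releases()/__acquires() annotations added below document.)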
*/ static int try_nonblocking_invalidate(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; @@ -2219,6 +2214,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) */ static int unsafe_request_wait(struct inode *inode) { + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; int ret, err = 0; @@ -2238,6 +2234,81 @@ static int unsafe_request_wait(struct inode *inode) } spin_unlock(&ci->i_unsafe_lock); + /* + * Trigger a flush of the journal logs in all the relevant MDSes + * manually, or in the worst case we must wait at most 5 seconds + * for the MDSes to flush the journal logs periodically. + */ + if (req1 || req2) { + struct ceph_mds_session **sessions = NULL; + struct ceph_mds_session *s; + struct ceph_mds_request *req; + unsigned int max; + int i; + + /* + * The mdsc->max_sessions is unlikely to change in + * practice; here we retry by reallocating the + * sessions array memory instead of holding the + * mdsc->mutex lock across the walk. + */ +retry: + max = mdsc->max_sessions; + sessions = krealloc(sessions, max * sizeof(s), GFP_KERNEL | __GFP_ZERO); + if (!sessions) + return -ENOMEM; + + spin_lock(&ci->i_unsafe_lock); + if (req1) { + list_for_each_entry(req, &ci->i_unsafe_dirops, + r_unsafe_dir_item) { + s = req->r_session; + if (unlikely(s->s_mds >= max)) { + spin_unlock(&ci->i_unsafe_lock); + goto retry; + } + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + if (req2) { + list_for_each_entry(req, &ci->i_unsafe_iops, + r_unsafe_target_item) { + s = req->r_session; + if (unlikely(s->s_mds >= max)) { + spin_unlock(&ci->i_unsafe_lock); + goto retry; + } + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + spin_unlock(&ci->i_unsafe_lock); + + /* the auth MDS */ + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) { + s = ci->i_auth_cap->session; + if (!sessions[s->s_mds]) + sessions[s->s_mds] = ceph_get_mds_session(s); + } + spin_unlock(&ci->i_ceph_lock); + + /* send flush mdlog request to MDSes */ + for (i = 0; i < max; i++) { + s = sessions[i]; + if (s) { + send_flush_mdlog(s); + ceph_put_mds_session(s); + } + } + kfree(sessions); + } + dout("unsafe_request_wait %p wait on tid %llu %llu\n", inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { @@ -3008,7 +3079,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) - drop_inode_snap_realm(ci); + ceph_change_snap_realm(inode, NULL); } } if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { @@ -3114,7 +3185,16 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, break; } } - BUG_ON(!found); + + if (!found) { + /* + * The capsnap should already be removed when removing + * the auth cap in the case of a forced unmount. + */ + WARN_ON_ONCE(ci->i_auth_cap); + goto unlock; + } + capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = true; @@ -3136,6 +3216,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, complete_capsnap ?
" (complete capsnap)" : ""); } +unlock: spin_unlock(&ci->i_ceph_lock); if (last) { @@ -3606,6 +3687,43 @@ out: iput(inode); } +void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + bool ret; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); + + list_del_init(&capsnap->ci_item); + ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); + if (wake_ci) + *wake_ci = ret; + + spin_lock(&mdsc->cap_dirty_lock); + if (list_empty(&ci->i_cap_flush_list)) + list_del_init(&ci->i_flushing_item); + + ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); + if (wake_mdsc) + *wake_mdsc = ret; + spin_unlock(&mdsc->cap_dirty_lock); +} + +void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + lockdep_assert_held(&ci->i_ceph_lock); + + WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); + __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); +} + /* * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can * throw away our cap_snap. @@ -3643,23 +3761,10 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, capsnap, capsnap->follows); } } - if (flushed) { - WARN_ON(capsnap->dirty_pages || capsnap->writing); - dout(" removing %p cap_snap %p follows %lld\n", - inode, capsnap, follows); - list_del(&capsnap->ci_item); - wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); - - spin_lock(&mdsc->cap_dirty_lock); - - if (list_empty(&ci->i_cap_flush_list)) - list_del_init(&ci->i_flushing_item); - - wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, - &capsnap->cap_flush); - spin_unlock(&mdsc->cap_dirty_lock); - } + if (flushed) + ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); + if (flushed) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); @@ -3743,7 +3848,7 @@ retry: goto out_unlock; if (target < 0) { - __ceph_remove_cap(cap, false); + ceph_remove_cap(cap, false); goto out_unlock; } @@ -3778,7 +3883,7 @@ retry: change_auth_cap_ses(ci, tcap->session); } } - __ceph_remove_cap(cap, false); + ceph_remove_cap(cap, false); goto out_unlock; } else if (tsession) { /* add placeholder for the export tagert */ @@ -3795,7 +3900,7 @@ retry: spin_unlock(&mdsc->cap_dirty_lock); } - __ceph_remove_cap(cap, false); + ceph_remove_cap(cap, false); goto out_unlock; } @@ -3906,7 +4011,7 @@ retry: ocap->mseq, mds, le32_to_cpu(ph->seq), le32_to_cpu(ph->mseq)); } - __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } *old_issued = issued; @@ -4134,8 +4239,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, done: mutex_unlock(&session->s_mutex); done_unlocked: - ceph_put_string(extra_info.pool_ns); iput(inode); +out: + ceph_put_string(extra_info.pool_ns); return; flush_cap_releases: @@ -4150,7 +4256,7 @@ flush_cap_releases: bad: pr_err("ceph_handle_caps: corrupt message\n"); ceph_msg_dump(msg); - return; + goto out; } /* @@ -4225,33 +4331,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) dout("flush_dirty_caps done\n"); } -static void iterate_sessions(struct ceph_mds_client *mdsc, - void (*cb)(struct ceph_mds_session *)) -{ - int mds; - - mutex_lock(&mdsc->mutex); - 
for (mds = 0; mds < mdsc->max_sessions; ++mds) { - struct ceph_mds_session *s; - - if (!mdsc->sessions[mds]) - continue; - - s = ceph_get_mds_session(mdsc->sessions[mds]); - if (!s) - continue; - - mutex_unlock(&mdsc->mutex); - cb(s); - ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); - } - mutex_unlock(&mdsc->mutex); -} - void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - iterate_sessions(mdsc, flush_dirty_session_caps); + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); } void __ceph_touch_fmode(struct ceph_inode_info *ci, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e1d605a02d4a..d16fd2d5fd42 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1722,32 +1722,26 @@ retry_snap: goto out; } - err = file_remove_privs(file); - if (err) + down_read(&osdc->lock); + map_flags = osdc->osdmap->flags; + pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); + up_read(&osdc->lock); + if ((map_flags & CEPH_OSDMAP_FULL) || + (pool_flags & CEPH_POOL_FLAG_FULL)) { + err = -ENOSPC; goto out; + } - err = file_update_time(file); + err = file_remove_privs(file); if (err) goto out; - inode_inc_iversion_raw(inode); - if (ci->i_inline_version != CEPH_INLINE_NONE) { err = ceph_uninline_data(file, NULL); if (err < 0) goto out; } - down_read(&osdc->lock); - map_flags = osdc->osdmap->flags; - pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); - up_read(&osdc->lock); - if ((map_flags & CEPH_OSDMAP_FULL) || - (pool_flags & CEPH_POOL_FLAG_FULL)) { - err = -ENOSPC; - goto out; - } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) @@ -1759,6 +1753,12 @@ retry_snap: if (err < 0) goto out; + err = file_update_time(file); + if (err) + goto out_caps; + + inode_inc_iversion_raw(inode); + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); @@ -1842,6 +1842,8 @@ retry_snap: } goto out_unlocked; +out_caps: + ceph_put_cap_refs(ci, got); out: if (direct_lock) ceph_end_io_direct(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1bd2cc015913..2df1e1284451 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -581,16 +581,9 @@ void ceph_evict_inode(struct inode *inode) */ if (ci->i_snap_realm) { if (ceph_snap(inode) == CEPH_NOSNAP) { - struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", - realm); - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); + ci->i_snap_realm); + ceph_change_snap_realm(inode, NULL); } else { ceph_put_snapid_map(mdsc, ci->i_snapid_map); ci->i_snap_realm = NULL; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0b69aec23e5c..7cad180d6deb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/bits.h> #include <linux/ktime.h> +#include <linux/bitmap.h> #include "super.h" #include "mds_client.h" @@ -652,14 +653,9 @@ const char *ceph_session_state_name(int s) struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) { - if (refcount_inc_not_zero(&s->s_ref)) { - dout("mdsc get_session %p %d -> %d\n", s, - refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); + if (refcount_inc_not_zero(&s->s_ref)) return s; - } 
else { - dout("mdsc get_session %p 0 -- FAIL\n", s); - return NULL; - } + return NULL; } void ceph_put_mds_session(struct ceph_mds_session *s) @@ -667,8 +663,6 @@ void ceph_put_mds_session(struct ceph_mds_session *s) if (IS_ERR_OR_NULL(s)) return; - dout("mdsc put_session %p %d -> %d\n", s, - refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); @@ -743,8 +737,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; - s->s_ttl = 0; - s->s_seq = 0; mutex_init(&s->s_mutex); ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); @@ -753,17 +745,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); - s->s_renew_requested = 0; - s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); - s->s_nr_caps = 0; refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); xa_init(&s->s_delegated_inos); - s->s_num_cap_releases = 0; - s->s_cap_reconnect = 0; - s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); @@ -811,6 +797,33 @@ static void put_request_session(struct ceph_mds_request *req) } } +void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state) +{ + int mds; + + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; ++mds) { + struct ceph_mds_session *s; + + s = __ceph_lookup_mds_session(mdsc, mds); + if (!s) + continue; + + if (check_state && !check_session_state(s)) { + ceph_put_mds_session(s); + continue; + } + + mutex_unlock(&mdsc->mutex); + cb(s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + void ceph_mdsc_release_request(struct kref *kref) { struct ceph_mds_request *req = container_of(kref, @@ -1155,7 +1168,7 @@ random: /* * session messages */ -static struct ceph_msg *create_session_msg(u32 op, u64 seq) +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; @@ -1163,7 +1176,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, false); if (!msg) { - pr_err("create_session_msg ENOMEM creating msg\n"); + pr_err("ENOMEM creating session %s msg\n", + ceph_session_op_name(op)); return NULL; } h = msg->front.iov_base; @@ -1294,7 +1308,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); if (!msg) { - pr_err("create_session_msg ENOMEM creating msg\n"); + pr_err("ENOMEM creating session open msg\n"); return ERR_PTR(-ENOMEM); } p = msg->front.iov_base; @@ -1583,14 +1597,39 @@ out: return ret; } +static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_snap *capsnap; + int capsnap_release = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); + + while (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + __ceph_remove_capsnap(inode, capsnap, NULL, NULL); + ceph_put_snap_context(capsnap->context); + 
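/* the caller drops one inode reference (iput) per released capsnap, see remove_session_caps_cb() */ +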
ceph_put_cap_snap(capsnap); + capsnap_release++; + } + wake_up_all(&ci->i_cap_wq); + wake_up_all(&mdsc->cap_flushing_wq); + return capsnap_release; +} + static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); bool dirty_dropped = false; bool invalidate = false; + int capsnap_release = 0; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1598,7 +1637,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = fsc->mdsc; if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (inode->i_data.nrpages > 0) @@ -1662,6 +1700,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (!list_empty(&ci->i_cap_snaps)) + capsnap_release = remove_capsnaps(mdsc, inode); } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1678,6 +1719,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ceph_queue_invalidate(inode); if (dirty_dropped) iput(inode); + while (capsnap_release--) + iput(inode); return 0; } @@ -1803,8 +1846,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, dout("send_renew_caps to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); - msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, - ++session->s_renew_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); @@ -1818,7 +1861,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc, dout("send_flushmsg_ack to mds%d (%s) seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), seq); - msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); @@ -1870,7 +1913,8 @@ static int request_close_session(struct ceph_mds_session *session) dout("request_close_session mds%d state %s seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), session->s_seq); - msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE, + session->s_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); @@ -1965,7 +2009,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (oissued) { /* we aren't the only cap..
just remove us */ - __ceph_remove_cap(cap, true); + ceph_remove_cap(cap, true); (*remaining)--; } else { struct dentry *dentry; @@ -4150,13 +4194,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, struct ceph_mdsmap *newmap, struct ceph_mdsmap *oldmap) { - int i; + int i, j, err; int oldstate, newstate; struct ceph_mds_session *s; + unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0}; dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); + if (newmap->m_info) { + for (i = 0; i < newmap->possible_max_rank; i++) { + for (j = 0; j < newmap->m_info[i].num_export_targets; j++) + set_bit(newmap->m_info[i].export_targets[j], targets); + } + } + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { if (!mdsc->sessions[i]) continue; @@ -4210,6 +4262,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, if (s->s_state == CEPH_MDS_SESSION_RESTARTING && newstate >= CEPH_MDS_STATE_RECONNECT) { mutex_unlock(&mdsc->mutex); + clear_bit(i, targets); send_mds_reconnect(mdsc, s); mutex_lock(&mdsc->mutex); } @@ -4232,6 +4285,51 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } + /* + * Only open and reconnect sessions that don't exist yet. + */ + for (i = 0; i < newmap->possible_max_rank; i++) { + /* + * If the import MDS crashed just after the EImportStart + * journal was flushed, then when a standby MDS takes + * over and replays the EImportStart journal, the new + * MDS daemon will wait for the client to reconnect, + * but the client may never have registered/opened the + * session. + * + * Try to reconnect to that MDS daemon if its rank + * number is in the export targets array and it is in + * the up:reconnect state. + */ + newstate = ceph_mdsmap_get_state(newmap, i); + if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT) + continue; + + /* + * In rare cases the session may already have been + * registered and opened by requests that chose random + * MDSes during the mdsc->mutex unlock/lock gap below. + * But the related MDS daemon will just queue those + * requests and keep waiting for the client's + * reconnection request in the up:reconnect state.
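+ * + * For example (hypothetical ranks, not from this change): if the new mdsmap lists rank 1 as an export target of rank 0 and rank 1 is in up:reconnect, bit 1 was set in 'targets' above, so a session to rank 1 is opened here and a reconnect is sent even though no request has ever touched that rank.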
+ */ + s = __ceph_lookup_mds_session(mdsc, i); + if (likely(!s)) { + s = __open_export_target_session(mdsc, i); + if (IS_ERR(s)) { + err = PTR_ERR(s); + pr_err("failed to open export target session, err %d\n", + err); + continue; + } + } + dout("send reconnect to export target mds.%d\n", i); + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) @@ -4409,24 +4507,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, } /* - * lock unlock sessions, to wait ongoing session activities + * lock unlock the session, to wait for ongoing session activities */ -static void lock_unlock_sessions(struct ceph_mds_client *mdsc) +static void lock_unlock_session(struct ceph_mds_session *s) { - int i; - - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) { - struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (!s) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&s->s_mutex); - mutex_unlock(&s->s_mutex); - ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); - } - mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); } static void maybe_recover_session(struct ceph_mds_client *mdsc) @@ -4448,6 +4534,8 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { + struct ceph_fs_client *fsc = s->s_mdsc->fsc; + switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { @@ -4456,8 +4544,9 @@ bool check_session_state(struct ceph_mds_session *s) } break; case CEPH_MDS_SESSION_CLOSING: - /* Should never reach this when we're unmounting */ - WARN_ON_ONCE(s->s_ttl); + /* Should never reach this when not force unmounting */ + WARN_ON_ONCE(s->s_ttl && + READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); fallthrough; case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: @@ -4584,21 +4673,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); - mdsc->sessions = NULL; - atomic_set(&mdsc->num_sessions, 0); - mdsc->max_sessions = 0; - mdsc->stopping = 0; - atomic64_set(&mdsc->quotarealms_count, 0); mdsc->quotarealms_inodes = RB_ROOT; mutex_init(&mdsc->quotarealms_inodes_mutex); - mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); - mdsc->num_snap_realms = 0; spin_lock_init(&mdsc->snap_empty_lock); - mdsc->last_tid = 0; - mdsc->oldest_tid = 0; mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; @@ -4610,11 +4690,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mdsc->last_cap_flush_tid = 1; INIT_LIST_HEAD(&mdsc->cap_flush_list); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); - mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); - atomic_set(&mdsc->cap_reclaim_pending, 0); err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; @@ -4676,6 +4754,30 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests done\n"); } +void send_flush_mdlog(struct ceph_mds_session *s) +{ + struct ceph_msg *msg; + + /* + * Pre-luminous MDS crashes when it sees an unknown session
request + */ + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS)) + return; + + mutex_lock(&s->s_mutex); + dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds, + ceph_session_state_name(s->s_state), s->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, + s->s_seq); + if (!msg) { + pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); + } else { + ceph_con_send(&s->s_con, msg); + } + mutex_unlock(&s->s_mutex); +} + /* * called before mount is ro, and before dentries are torn down. * (hmm, does this still race with new lookups?) @@ -4685,7 +4787,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) dout("pre_umount\n"); mdsc->stopping = 1; - lock_unlock_sessions(mdsc); + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); + ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); @@ -4912,7 +5015,6 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) ceph_metric_destroy(&mdsc->metric); - flush_delayed_work(&mdsc->metric.delayed_work); fsc->mdsc = NULL; kfree(mdsc); dout("mdsc_destroy %p done\n", mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 20e42d8b66c6..97c7f7bfa55f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -522,6 +522,11 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } +extern void send_flush_mdlog(struct ceph_mds_session *s); +extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state); +extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); extern void __ceph_queue_cap_release(struct ceph_mds_session *session, struct ceph_cap *cap); extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 3c444b9cb17b..61d67cbcb367 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -122,6 +122,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) int err; u8 mdsmap_v; u16 mdsmap_ev; + u32 target; m = kzalloc(sizeof(*m), GFP_NOFS); if (!m) @@ -260,9 +261,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) sizeof(u32), GFP_NOFS); if (!info->export_targets) goto nomem; - for (j = 0; j < num_export_targets; j++) - info->export_targets[j] = - ceph_decode_32(&pexport_targets); + for (j = 0; j < num_export_targets; j++) { + target = ceph_decode_32(&pexport_targets); + if (target >= m->possible_max_rank) { + err = -EIO; + goto corrupt; + } + info->export_targets[j] = target; + } } else { info->export_targets = NULL; } diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 5ac151eb0d49..04d5df29bbbf 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -302,6 +302,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m) if (!m) return; + cancel_delayed_work_sync(&m->delayed_work); + percpu_counter_destroy(&m->total_inodes); percpu_counter_destroy(&m->opened_inodes); percpu_counter_destroy(&m->i_caps_mis); @@ -309,8 +311,6 @@ void ceph_metric_destroy(struct ceph_client_metric *m) percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); - cancel_delayed_work_sync(&m->delayed_work); - ceph_put_mds_session(m->session); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 15105f9da3fd..b41e6724c591 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -849,6 +849,43 @@ static void 
flush_snaps(struct ceph_mds_client *mdsc) dout("flush_snaps done\n"); } +/** + * ceph_change_snap_realm - change the snap_realm for an inode + * @inode: inode to move to new snap realm + * @realm: new realm to move inode into (may be NULL) + * + * Detach an inode from its old snaprealm (if any) and attach it to + * the new snaprealm (if any). The old snap realm reference held by + * the inode is put. If realm is non-NULL, then the caller's reference + * to it is taken over by the inode. + */ +void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_snap_realm *oldrealm = ci->i_snap_realm; + + lockdep_assert_held(&ci->i_ceph_lock); + + if (oldrealm) { + spin_lock(&oldrealm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + if (oldrealm->ino == ci->i_vino.ino) + oldrealm->inode = NULL; + spin_unlock(&oldrealm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, oldrealm); + } + + ci->i_snap_realm = realm; + + if (realm) { + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); + if (realm->ino == ci->i_vino.ino) + realm->inode = inode; + spin_unlock(&realm->inodes_with_caps_lock); + } +} /* * Handle a snap notification from the MDS. @@ -935,7 +972,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, }; struct inode *inode = ceph_find_inode(sb, vino); struct ceph_inode_info *ci; - struct ceph_snap_realm *oldrealm; if (!inode) continue; @@ -960,27 +996,10 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, } dout(" will move %p to split realm %llx %p\n", inode, realm->ino, realm); - /* - * Move the inode to the new realm - */ - oldrealm = ci->i_snap_realm; - spin_lock(&oldrealm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - spin_unlock(&oldrealm->inodes_with_caps_lock); - - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - if (realm->ino == ci->i_vino.ino) - realm->inode = inode; - spin_unlock(&realm->inodes_with_caps_lock); - - spin_unlock(&ci->i_ceph_lock); ceph_get_snap_realm(mdsc, realm); - ceph_put_snap_realm(mdsc, oldrealm); - + ceph_change_snap_realm(inode, realm); + spin_unlock(&ci->i_ceph_lock); iput(inode); continue; diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 4a79f3632260..573bb9556fb5 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -46,6 +46,7 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; case CEPH_SESSION_FORCE_RO: return "force_ro"; case CEPH_SESSION_REJECT: return "reject"; + case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog"; } return "???"; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c30258f95e37..a40eb14c282a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -418,7 +418,6 @@ struct ceph_inode_info { struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ }; - int i_snap_realm_counter; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; struct timespec64 i_btime; @@ -929,6 +928,7 @@ extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, extern int ceph_update_snap_trace(struct ceph_mds_client *m, void *p, void *e, bool deletion, struct ceph_snap_realm **realm_ret); +void ceph_change_snap_realm(struct inode *inode, struct 
ceph_snap_realm *realm); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); @@ -1138,6 +1138,7 @@ extern void ceph_add_cap(struct inode *inode, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void __ceph_remove_caps(struct ceph_inode_info *ci); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); @@ -1163,6 +1164,12 @@ extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); +extern void __ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); +extern void ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 1242db8d3444..159a1ffa4f4b 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -340,6 +340,18 @@ static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val, ceph_cap_string(issued), issued); } +static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = ceph_fmt_xattr(val, size, "%d", + ci->i_auth_cap ? ci->i_auth_cap->session->s_mds : -1); + spin_unlock(&ci->i_ceph_lock); + return ret; +} + #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." 
#_name2 @@ -473,6 +485,13 @@ static struct ceph_vxattr ceph_common_vxattrs[] = { .exists_cb = NULL, .flags = VXATTR_FLAG_READONLY, }, + { + .name = "ceph.auth_mds", + .name_size = sizeof("ceph.auth_mds"), + .getxattr_cb = ceph_vxattrcb_auth_mds, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, { .name = NULL, 0 } /* Required table terminator */ }; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 6679e07e533e..2e6f40344037 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -22,7 +22,7 @@ #include <linux/random.h> #include <linux/highmem.h> #include <linux/fips.h> -#include "../cifs_common/arc4.h" +#include "../smbfs_common/arc4.h" #include <crypto/aead.h> int __cifs_calc_signature(struct smb_rqst *rqst, diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index dc920e206336..98e8e5aa0613 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -12,7 +12,7 @@ #include <net/sock.h> #include <asm/unaligned.h> -#include "smbfsctl.h" +#include "../smbfs_common/smbfsctl.h" #define CIFS_PROT 0 #define POSIX_PROT (CIFS_PROT+1) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index ddc0e8f97872..bda606dc72b1 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -689,13 +689,19 @@ smb2_close_cached_fid(struct kref *ref) cifs_dbg(FYI, "clear cached root file handle\n"); SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, cfid->fid->volatile_fid); - cfid->is_valid = false; - cfid->file_all_info_is_valid = false; - cfid->has_lease = false; - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } + } + + /* + * We only check validity above to send SMB2_close, + * but we still need to invalidate these entries + * when this function is called + */ + cfid->is_valid = false; + cfid->file_all_info_is_valid = false; + cfid->has_lease = false; + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; } } diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 10047cc55286..4a0487753869 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -24,7 +24,7 @@ #include "cifsglob.h" #include "cifs_debug.h" #include "cifsproto.h" -#include "../cifs_common/md4.h" +#include "../smbfs_common/md4.h" #ifndef false #define false 0 diff --git a/fs/erofs/super.c b/fs/erofs/super.c index a8d49e8fc83a..11b88559f8bf 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -546,7 +546,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return err; if (test_opt(ctx, DAX_ALWAYS) && - !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) { + !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); clear_opt(ctx, DAX_ALWAYS); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 648ed77f4164..06f4c5ae1451 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1686,8 +1686,8 @@ static int ep_send_events(struct eventpoll *ep, if (!revents) continue; - if (__put_user(revents, &events->events) || - __put_user(epi->event.data, &events->data)) { + events = epoll_put_uevent(revents, epi->event.data, events); + if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) @@ -1695,7 +1695,6 @@ static int ep_send_events(struct eventpoll *ep, break; } res++; - events++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 987bcf32ed46..d8d580b609ba 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -946,7 +946,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (test_opt(sb, DAX)) { - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0, + bdev_nr_sectors(sb->s_bdev))) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); clear_opt(sbi->s_mount_opt, DAX); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 136940af00b8..0775950ee84e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4287,7 +4287,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (bdev_dax_supported(sb->s_bdev, blocksize)) + if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0, + bdev_nr_sectors(sb->s_bdev))) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { diff --git a/fs/file.c b/fs/file.c index d8afa8266859..8627dacfc424 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1150,6 +1150,12 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } +int receive_fd(struct file *file, unsigned int o_flags) +{ + return __receive_fd(file, NULL, o_flags); +} +EXPORT_SYMBOL_GPL(receive_fd); + static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/fs/filesystems.c b/fs/filesystems.c index 90b8d879fbaf..58b9067b2391 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -209,21 +209,28 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) } #endif -int __init get_filesystem_list(char *buf) +int __init list_bdev_fs_names(char *buf, size_t size) { - int len = 0; - struct file_system_type * tmp; + struct file_system_type *p; + size_t len; + int count = 0; read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp && len < PAGE_SIZE - 80) { - len += sprintf(buf+len, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", - tmp->name); - tmp = tmp->next; + for (p = file_systems; p; p = p->next) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + continue; + len = strlen(p->name) + 1; + if (len > size) { + pr_warn("%s: truncating file system list\n", __func__); + break; + } + memcpy(buf, p->name, len); + buf += len; + size -= len; + count++; } read_unlock(&file_systems_lock); - return len; + return count; } #ifdef CONFIG_PROC_FS diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 980d44fd3a36..3df07c0e32b3 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -165,7 +165,6 @@ int fs_lookup_param(struct fs_context *fc, return invalf(fc, "%s: not usable as path", param->key); } - f->refcnt++; /* filename_lookup() drops our ref. */ ret = filename_lookup(param->dirfd, f, flags, _path, NULL); if (ret < 0) { errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6e15434b23ac..3130f85d2b3f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1985,8 +1985,8 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, if (error) goto out; - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + error = may_setattr(&init_user_ns, inode, attr->ia_valid); + if (error) goto error; error = setattr_prepare(&init_user_ns, dentry, attr); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 7d0c3dbb2898..d5c9d886cd9f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -381,6 +381,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, static const struct file_operations hostfs_file_fops = { .llseek = generic_file_llseek, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, diff --git a/fs/internal.h b/fs/internal.h index 68a2ae029a27..3cd065c8a66b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -18,7 +18,7 @@ struct user_namespace; struct pipe_inode_info; /* - * block_dev.c + * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); diff --git a/fs/io-wq.c b/fs/io-wq.c index d80e4a735677..6c55362c1f99 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -709,6 +709,7 @@ static void create_worker_cont(struct callback_head *cb) } raw_spin_unlock(&wqe->lock); io_worker_ref_put(wqe->wq); + kfree(worker); return; } @@ -725,6 +726,7 @@ static void io_workqueue_create(struct work_struct *work) if (!io_queue_worker_create(worker, acct, create_worker_cont)) { clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); + kfree(worker); } } @@ -759,6 +761,7 @@ fail: if (!IS_ERR(tsk)) { io_init_new_worker(wqe, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + kfree(worker); goto fail; } else { INIT_WORK(&worker->work, io_workqueue_create); @@ -832,6 +835,11 @@ append: wq_list_add_after(&work->list, &tail->list, &acct->work_list); } +static bool io_wq_work_match_item(struct io_wq_work *work, void *data) +{ + return work == data; +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -844,7 +852,6 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) */ if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || (work->flags & IO_WQ_WORK_CANCEL)) { -run_cancel: io_run_cancel(work, wqe); return; } @@ -864,15 +871,22 @@ run_cancel: bool did_create; did_create = io_wqe_create_worker(wqe, acct); - if (unlikely(!did_create)) { - 
raw_spin_lock(&wqe->lock); - /* fatal condition, failed to create the first worker */ - if (!acct->nr_workers) { - raw_spin_unlock(&wqe->lock); - goto run_cancel; - } - raw_spin_unlock(&wqe->lock); + if (likely(did_create)) + return; + + raw_spin_lock(&wqe->lock); + /* fatal condition, failed to create the first worker */ + if (!acct->nr_workers) { + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; + + if (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_lock(&wqe->lock); } + raw_spin_unlock(&wqe->lock); } } @@ -1122,7 +1136,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data) { struct io_worker *worker; - if (cb->func != create_worker_cb || cb->func != create_worker_cont) + if (cb->func != create_worker_cb && cb->func != create_worker_cont) return false; worker = container_of(cb, struct io_worker, create_work); return worker->wqe->wq == data; @@ -1143,9 +1157,14 @@ static void io_wq_exit_workers(struct io_wq *wq) while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; + struct io_wqe_acct *acct; worker = container_of(cb, struct io_worker, create_work); - atomic_dec(&worker->wqe->acct[worker->create_index].nr_running); + acct = io_wqe_get_acct(worker); + atomic_dec(&acct->nr_running); + raw_spin_lock(&worker->wqe->lock); + acct->nr_workers--; + raw_spin_unlock(&worker->wqe->lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); diff --git a/fs/io_uring.c b/fs/io_uring.c index d816c09c88a5..16fb7436043c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1482,6 +1482,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status) struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { + if (status) + req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); @@ -1619,8 +1621,11 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { + /* see waitqueue_active() comment */ + smp_mb(); + if (ctx->flags & IORING_SETUP_SQPOLL) { - if (wq_has_sleeper(&ctx->cq_wait)) + if (waitqueue_active(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) @@ -3480,6 +3485,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (req->flags & REQ_F_NOWAIT) goto done; /* some cases will consume bytes even on error returns */ + iov_iter_reexpand(iter, iter->count + iter->truncated); iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; } else if (ret == -EIOCBQUEUED) { @@ -3619,6 +3625,7 @@ done: } else { copy_iov: /* some cases will consume bytes even on error returns */ + iov_iter_reexpand(iter, iter->count + iter->truncated); iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); return ret ?: -EAGAIN; @@ -10548,7 +10555,14 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { + /* + * Observe the correct sqd->lock -> ctx->uring_lock + * ordering. Fine to drop uring_lock here, we hold + * a ref to the ctx. 
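+ * + * i.e. we avoid this ABBA inversion (a sketch, not actual call sites): + * + * task A task B + * mutex_lock(&sqd->lock) mutex_lock(&ctx->uring_lock) + * mutex_lock(&ctx->uring_lock) mutex_lock(&sqd->lock)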
+ */ + mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); + mutex_lock(&ctx->uring_lock); tctx = sqd->thread->io_uring; } } else { @@ -10851,7 +10865,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); - BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 2243a2c64b37..8317f7ca402b 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -28,37 +28,60 @@ static int try_to_realloc_ndr_blob(struct ndr *n, size_t sz) return 0; } -static void ndr_write_int16(struct ndr *n, __u16 value) +static int ndr_write_int16(struct ndr *n, __u16 value) { - if (n->length <= n->offset + sizeof(value)) - try_to_realloc_ndr_blob(n, sizeof(value)); + if (n->length <= n->offset + sizeof(value)) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sizeof(value)); + if (ret) + return ret; + } *(__le16 *)ndr_get_field(n) = cpu_to_le16(value); n->offset += sizeof(value); + return 0; } -static void ndr_write_int32(struct ndr *n, __u32 value) +static int ndr_write_int32(struct ndr *n, __u32 value) { - if (n->length <= n->offset + sizeof(value)) - try_to_realloc_ndr_blob(n, sizeof(value)); + if (n->length <= n->offset + sizeof(value)) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sizeof(value)); + if (ret) + return ret; + } *(__le32 *)ndr_get_field(n) = cpu_to_le32(value); n->offset += sizeof(value); + return 0; } -static void ndr_write_int64(struct ndr *n, __u64 value) +static int ndr_write_int64(struct ndr *n, __u64 value) { - if (n->length <= n->offset + sizeof(value)) - try_to_realloc_ndr_blob(n, sizeof(value)); + if (n->length <= n->offset + sizeof(value)) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sizeof(value)); + if (ret) + return ret; + } *(__le64 *)ndr_get_field(n) = cpu_to_le64(value); n->offset += sizeof(value); + return 0; } static int ndr_write_bytes(struct ndr *n, void *value, size_t sz) { - if (n->length <= n->offset + sz) - try_to_realloc_ndr_blob(n, sz); + if (n->length <= n->offset + sz) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sz); + if (ret) + return ret; + } memcpy(ndr_get_field(n), value, sz); n->offset += sz; @@ -70,8 +93,13 @@ static int ndr_write_string(struct ndr *n, char *value) size_t sz; sz = strlen(value) + 1; - if (n->length <= n->offset + sz) - try_to_realloc_ndr_blob(n, sz); + if (n->length <= n->offset + sz) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sz); + if (ret) + return ret; + } memcpy(ndr_get_field(n), value, sz); n->offset += sz; @@ -81,9 +109,14 @@ static int ndr_write_string(struct ndr *n, char *value) static int ndr_read_string(struct ndr *n, void *value, size_t sz) { - int len = strnlen(ndr_get_field(n), sz); + int len; - memcpy(value, ndr_get_field(n), len); + if (n->offset + sz > n->length) + return -EINVAL; + + len = strnlen(ndr_get_field(n), sz); + if (value) + memcpy(value, ndr_get_field(n), len); len++; n->offset += len; n->offset = ALIGN(n->offset, 2); @@ -92,41 +125,52 @@ static int ndr_read_string(struct ndr *n, void *value, size_t sz) static int ndr_read_bytes(struct ndr *n, void *value, size_t sz) { - memcpy(value, ndr_get_field(n), sz); + if (n->offset + sz > n->length) + return -EINVAL; + + if (value) + memcpy(value, ndr_get_field(n), sz); n->offset += sz; return 0; } -static __u16 ndr_read_int16(struct ndr *n) +static int ndr_read_int16(struct ndr *n, __u16 *value) 
{ - __u16 ret; + if (n->offset + sizeof(__u16) > n->length) + return -EINVAL; - ret = le16_to_cpu(*(__le16 *)ndr_get_field(n)); + if (value) + *value = le16_to_cpu(*(__le16 *)ndr_get_field(n)); n->offset += sizeof(__u16); - return ret; + return 0; } -static __u32 ndr_read_int32(struct ndr *n) +static int ndr_read_int32(struct ndr *n, __u32 *value) { - __u32 ret; + if (n->offset + sizeof(__u32) > n->length) + return -EINVAL; - ret = le32_to_cpu(*(__le32 *)ndr_get_field(n)); + if (value) + *value = le32_to_cpu(*(__le32 *)ndr_get_field(n)); n->offset += sizeof(__u32); - return ret; + return 0; } -static __u64 ndr_read_int64(struct ndr *n) +static int ndr_read_int64(struct ndr *n, __u64 *value) { - __u64 ret; + if (n->offset + sizeof(__u64) > n->length) + return -EINVAL; - ret = le64_to_cpu(*(__le64 *)ndr_get_field(n)); + if (value) + *value = le64_to_cpu(*(__le64 *)ndr_get_field(n)); n->offset += sizeof(__u64); - return ret; + return 0; } int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) { char hex_attr[12] = {0}; + int ret; n->offset = 0; n->length = 1024; @@ -136,97 +180,161 @@ int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) if (da->version == 3) { snprintf(hex_attr, 10, "0x%x", da->attr); - ndr_write_string(n, hex_attr); + ret = ndr_write_string(n, hex_attr); } else { - ndr_write_string(n, ""); + ret = ndr_write_string(n, ""); } - ndr_write_int16(n, da->version); - ndr_write_int32(n, da->version); + if (ret) + return ret; + + ret = ndr_write_int16(n, da->version); + if (ret) + return ret; + + ret = ndr_write_int32(n, da->version); + if (ret) + return ret; + + ret = ndr_write_int32(n, da->flags); + if (ret) + return ret; + + ret = ndr_write_int32(n, da->attr); + if (ret) + return ret; - ndr_write_int32(n, da->flags); - ndr_write_int32(n, da->attr); if (da->version == 3) { - ndr_write_int32(n, da->ea_size); - ndr_write_int64(n, da->size); - ndr_write_int64(n, da->alloc_size); + ret = ndr_write_int32(n, da->ea_size); + if (ret) + return ret; + ret = ndr_write_int64(n, da->size); + if (ret) + return ret; + ret = ndr_write_int64(n, da->alloc_size); } else { - ndr_write_int64(n, da->itime); + ret = ndr_write_int64(n, da->itime); } - ndr_write_int64(n, da->create_time); + if (ret) + return ret; + + ret = ndr_write_int64(n, da->create_time); + if (ret) + return ret; + if (da->version == 3) - ndr_write_int64(n, da->change_time); - return 0; + ret = ndr_write_int64(n, da->change_time); + return ret; } int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) { - char *hex_attr; - int version2; - - hex_attr = kzalloc(n->length, GFP_KERNEL); - if (!hex_attr) - return -ENOMEM; + char hex_attr[12]; + unsigned int version2; + int ret; n->offset = 0; - ndr_read_string(n, hex_attr, n->length); - kfree(hex_attr); - da->version = ndr_read_int16(n); + ret = ndr_read_string(n, hex_attr, sizeof(hex_attr)); + if (ret) + return ret; + + ret = ndr_read_int16(n, &da->version); + if (ret) + return ret; if (da->version != 3 && da->version != 4) { pr_err("v%d version is not supported\n", da->version); return -EINVAL; } - version2 = ndr_read_int32(n); + ret = ndr_read_int32(n, &version2); + if (ret) + return ret; + if (da->version != version2) { pr_err("ndr version mismatched(version: %d, version2: %d)\n", da->version, version2); return -EINVAL; } - ndr_read_int32(n); - da->attr = ndr_read_int32(n); + ret = ndr_read_int32(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int32(n, &da->attr); + if (ret) + return ret; + if (da->version == 4) { - da->itime = ndr_read_int64(n);
- da->create_time = ndr_read_int64(n); + ret = ndr_read_int64(n, &da->itime); + if (ret) + return ret; + + ret = ndr_read_int64(n, &da->create_time); } else { - ndr_read_int32(n); - ndr_read_int64(n); - ndr_read_int64(n); - da->create_time = ndr_read_int64(n); - ndr_read_int64(n); + ret = ndr_read_int32(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int64(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int64(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int64(n, &da->create_time); + if (ret) + return ret; + + ret = ndr_read_int64(n, NULL); } - return 0; + return ret; } static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl) { - int i; + int i, ret; + + ret = ndr_write_int32(n, acl->count); + if (ret) + return ret; - ndr_write_int32(n, acl->count); n->offset = ALIGN(n->offset, 8); - ndr_write_int32(n, acl->count); - ndr_write_int32(n, 0); + ret = ndr_write_int32(n, acl->count); + if (ret) + return ret; + + ret = ndr_write_int32(n, 0); + if (ret) + return ret; for (i = 0; i < acl->count; i++) { n->offset = ALIGN(n->offset, 8); - ndr_write_int16(n, acl->entries[i].type); - ndr_write_int16(n, acl->entries[i].type); + ret = ndr_write_int16(n, acl->entries[i].type); + if (ret) + return ret; + + ret = ndr_write_int16(n, acl->entries[i].type); + if (ret) + return ret; if (acl->entries[i].type == SMB_ACL_USER) { n->offset = ALIGN(n->offset, 8); - ndr_write_int64(n, acl->entries[i].uid); + ret = ndr_write_int64(n, acl->entries[i].uid); } else if (acl->entries[i].type == SMB_ACL_GROUP) { n->offset = ALIGN(n->offset, 8); - ndr_write_int64(n, acl->entries[i].gid); + ret = ndr_write_int64(n, acl->entries[i].gid); } + if (ret) + return ret; /* push permission */ - ndr_write_int32(n, acl->entries[i].perm); + ret = ndr_write_int32(n, acl->entries[i].perm); } - return 0; + return ret; } int ndr_encode_posix_acl(struct ndr *n, @@ -235,7 +343,8 @@ int ndr_encode_posix_acl(struct ndr *n, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl) { - int ref_id = 0x00020000; + unsigned int ref_id = 0x00020000; + int ret; n->offset = 0; n->length = 1024; @@ -245,35 +354,46 @@ int ndr_encode_posix_acl(struct ndr *n, if (acl) { /* ACL ACCESS */ - ndr_write_int32(n, ref_id); + ret = ndr_write_int32(n, ref_id); ref_id += 4; } else { - ndr_write_int32(n, 0); + ret = ndr_write_int32(n, 0); } + if (ret) + return ret; if (def_acl) { /* DEFAULT ACL ACCESS */ - ndr_write_int32(n, ref_id); + ret = ndr_write_int32(n, ref_id); ref_id += 4; } else { - ndr_write_int32(n, 0); + ret = ndr_write_int32(n, 0); } - - ndr_write_int64(n, from_kuid(user_ns, inode->i_uid)); - ndr_write_int64(n, from_kgid(user_ns, inode->i_gid)); - ndr_write_int32(n, inode->i_mode); + if (ret) + return ret; + + ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode))); + if (ret) + return ret; + ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode))); + if (ret) + return ret; + ret = ndr_write_int32(n, inode->i_mode); + if (ret) + return ret; if (acl) { - ndr_encode_posix_acl_entry(n, acl); - if (def_acl) - ndr_encode_posix_acl_entry(n, def_acl); + ret = ndr_encode_posix_acl_entry(n, acl); + if (def_acl && !ret) + ret = ndr_encode_posix_acl_entry(n, def_acl); } - return 0; + return ret; } int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) { - int ref_id = 0x00020004; + unsigned int ref_id = 0x00020004; + int ret; n->offset = 0; n->length = 2048; @@ -281,36 +401,65 @@ int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (!n->data) 
return -ENOMEM; - ndr_write_int16(n, acl->version); - ndr_write_int32(n, acl->version); - ndr_write_int16(n, 2); - ndr_write_int32(n, ref_id); + ret = ndr_write_int16(n, acl->version); + if (ret) + return ret; + + ret = ndr_write_int32(n, acl->version); + if (ret) + return ret; + + ret = ndr_write_int16(n, 2); + if (ret) + return ret; + + ret = ndr_write_int32(n, ref_id); + if (ret) + return ret; /* push hash type and hash 64bytes */ - ndr_write_int16(n, acl->hash_type); - ndr_write_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); - ndr_write_bytes(n, acl->desc, acl->desc_len); - ndr_write_int64(n, acl->current_time); - ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + ret = ndr_write_int16(n, acl->hash_type); + if (ret) + return ret; - /* push ndr for security descriptor */ - ndr_write_bytes(n, acl->sd_buf, acl->sd_size); + ret = ndr_write_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; - return 0; + ret = ndr_write_bytes(n, acl->desc, acl->desc_len); + if (ret) + return ret; + + ret = ndr_write_int64(n, acl->current_time); + if (ret) + return ret; + + ret = ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; + + /* push ndr for security descriptor */ + ret = ndr_write_bytes(n, acl->sd_buf, acl->sd_size); + return ret; } int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) { - int version2; + unsigned int version2; + int ret; n->offset = 0; - acl->version = ndr_read_int16(n); + ret = ndr_read_int16(n, &acl->version); + if (ret) + return ret; if (acl->version != 4) { pr_err("v%d version is not supported\n", acl->version); return -EINVAL; } - version2 = ndr_read_int32(n); + ret = ndr_read_int32(n, &version2); + if (ret) + return ret; if (acl->version != version2) { pr_err("ndr version mismatched(version: %d, version2: %d)\n", acl->version, version2); @@ -318,11 +467,22 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) } /* Read Level */ - ndr_read_int16(n); + ret = ndr_read_int16(n, NULL); + if (ret) + return ret; + /* Read Ref Id */ - ndr_read_int32(n); - acl->hash_type = ndr_read_int16(n); - ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + ret = ndr_read_int32(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int16(n, &acl->hash_type); + if (ret) + return ret; + + ret = ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; ndr_read_bytes(n, acl->desc, 10); if (strncmp(acl->desc, "posix_acl", 9)) { @@ -331,15 +491,20 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) } /* Read Time */ - ndr_read_int64(n); + ret = ndr_read_int64(n, NULL); + if (ret) + return ret; + /* Read Posix ACL hash */ - ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + ret = ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; + acl->sd_size = n->length - n->offset; acl->sd_buf = kzalloc(acl->sd_size, GFP_KERNEL); if (!acl->sd_buf) return -ENOMEM; - ndr_read_bytes(n, acl->sd_buf, acl->sd_size); - - return 0; + ret = ndr_read_bytes(n, acl->sd_buf, acl->sd_size); + return ret; } diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 6ace6c2f22dc..16b6236d1bd2 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1614,9 +1614,11 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) buf->nlink = cpu_to_le32(inode->i_nlink); buf->reparse_tag = cpu_to_le32(fp->volatile_id); buf->mode = cpu_to_le32(inode->i_mode); - id_to_sid(from_kuid(user_ns, inode->i_uid), + id_to_sid(from_kuid_munged(&init_user_ns, + i_uid_into_mnt(user_ns, inode)), 
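The ndr.c rework above converts every read/write helper from returning the decoded value to returning an error code, with the value handed back through a pointer; passing NULL keeps the skip-a-field call sites (e.g. ndr_read_int32(n, NULL)) bounds-checked while discarding the data. A minimal, self-contained userspace sketch of that calling convention (names are illustrative, not the kernel code itself):

#include <stdint.h>
#include <string.h>
#include <errno.h>

struct ndr_buf {
	const uint8_t *data;
	size_t offset;
	size_t length;
};

/* Fail instead of reading past the end of a (possibly truncated) blob. */
static int ndr_buf_read_u16(struct ndr_buf *n, uint16_t *value)
{
	uint16_t v;

	if (n->offset + sizeof(v) > n->length)
		return -EINVAL;
	memcpy(&v, n->data + n->offset, sizeof(v));
	n->offset += sizeof(v);
	if (value)
		*value = v;	/* NULL means "validate and skip" */
	return 0;
}

Callers then propagate the first failure instead of consuming garbage, which is exactly what the ndr_decode_dos_attr() and ndr_decode_v4_ntacl() hunks above do at every field.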
diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c
index 6ace6c2f22dc..16b6236d1bd2 100644
--- a/fs/ksmbd/oplock.c
+++ b/fs/ksmbd/oplock.c
@@ -1614,9 +1614,11 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp)
 	buf->nlink = cpu_to_le32(inode->i_nlink);
 	buf->reparse_tag = cpu_to_le32(fp->volatile_id);
 	buf->mode = cpu_to_le32(inode->i_mode);
-	id_to_sid(from_kuid(user_ns, inode->i_uid),
+	id_to_sid(from_kuid_munged(&init_user_ns,
+				   i_uid_into_mnt(user_ns, inode)),
 		  SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]);
-	id_to_sid(from_kgid(user_ns, inode->i_gid),
+	id_to_sid(from_kgid_munged(&init_user_ns,
+				   i_gid_into_mnt(user_ns, inode)),
 		  SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]);
 }
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index d329ea49fa14..c86164dc70bb 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -2381,10 +2381,12 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work,
 			le32_to_cpu(sd_buf->ccontext.DataLength), true);
 }
 
-static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct inode *inode)
+static void ksmbd_acls_fattr(struct smb_fattr *fattr,
+			     struct user_namespace *mnt_userns,
+			     struct inode *inode)
 {
-	fattr->cf_uid = inode->i_uid;
-	fattr->cf_gid = inode->i_gid;
+	fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode);
+	fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode);
 	fattr->cf_mode = inode->i_mode;
 	fattr->cf_acls = NULL;
 	fattr->cf_dacls = NULL;
@@ -2893,7 +2895,7 @@ int smb2_open(struct ksmbd_work *work)
 			struct smb_ntsd *pntsd;
 			int pntsd_size, ace_num = 0;
 
-			ksmbd_acls_fattr(&fattr, inode);
+			ksmbd_acls_fattr(&fattr, user_ns, inode);
 			if (fattr.cf_acls)
 				ace_num = fattr.cf_acls->a_count;
 			if (fattr.cf_dacls)
@@ -3324,7 +3326,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level)
  */
 static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
 				       struct ksmbd_dir_info *d_info,
-				       struct user_namespace *user_ns,
 				       struct ksmbd_kstat *ksmbd_kstat)
 {
 	int next_entry_offset = 0;
@@ -3478,9 +3479,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
 			S_ISDIR(ksmbd_kstat->kstat->mode) ? ATTR_DIRECTORY_LE : ATTR_ARCHIVE_LE;
 		if (d_info->hide_dot_file && d_info->name[0] == '.')
 			posix_info->DosAttributes |= ATTR_HIDDEN_LE;
-		id_to_sid(from_kuid(user_ns, ksmbd_kstat->kstat->uid),
+		id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid),
 			  SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]);
-		id_to_sid(from_kgid(user_ns, ksmbd_kstat->kstat->gid),
+		id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid),
 			  SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]);
 		memcpy(posix_info->name, conv_name, conv_len);
 		posix_info->name_len = cpu_to_le32(conv_len);
@@ -3543,9 +3544,9 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
 			return -EINVAL;
 
 		lock_dir(priv->dir_fp);
-		dent = lookup_one_len(priv->d_info->name,
-				      priv->dir_fp->filp->f_path.dentry,
-				      priv->d_info->name_len);
+		dent = lookup_one(user_ns, priv->d_info->name,
+				  priv->dir_fp->filp->f_path.dentry,
+				  priv->d_info->name_len);
 		unlock_dir(priv->dir_fp);
 
 		if (IS_ERR(dent)) {
@@ -3571,7 +3572,6 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
 		rc = smb2_populate_readdir_entry(priv->work->conn,
 						 priv->info_level,
 						 priv->d_info,
-						 user_ns,
 						 &ksmbd_kstat);
 		dput(dent);
 		if (rc)
@@ -5008,7 +5008,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
 	user_ns = file_mnt_user_ns(fp->filp);
 	inode = file_inode(fp->filp);
 
-	ksmbd_acls_fattr(&fattr, inode);
+	ksmbd_acls_fattr(&fattr, user_ns, inode);
 
 	if (test_share_config_flag(work->tcon->share_conf,
 				   KSMBD_SHARE_FLAG_ACL_XATTR))
@@ -5246,7 +5246,9 @@ int smb2_echo(struct ksmbd_work *work)
 	return 0;
 }
 
-static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
+static int smb2_rename(struct ksmbd_work *work,
+		       struct ksmbd_file *fp,
+		       struct user_namespace *user_ns,
 		       struct smb2_file_rename_info *file_info,
 		       struct nls_table *local_nls)
 {
@@ -5310,7 +5312,7 @@ static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
 		if (rc)
 			goto out;
 
-		rc = ksmbd_vfs_setxattr(file_mnt_user_ns(fp->filp),
+		rc = ksmbd_vfs_setxattr(user_ns,
 					fp->filp->f_path.dentry,
 					xattr_stream_name,
 					NULL, 0, 0);
@@ -5438,11 +5440,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
 {
 	struct smb2_file_all_info *file_info;
 	struct iattr attrs;
-	struct iattr temp_attrs;
+	struct timespec64 ctime;
 	struct file *filp;
 	struct inode *inode;
 	struct user_namespace *user_ns;
-	int rc;
+	int rc = 0;
 
 	if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE))
 		return -EACCES;
@@ -5462,11 +5464,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
 	}
 
 	if (file_info->ChangeTime) {
-		temp_attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
-		attrs.ia_ctime = temp_attrs.ia_ctime;
+		attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
+		ctime = attrs.ia_ctime;
 		attrs.ia_valid |= ATTR_CTIME;
 	} else {
-		temp_attrs.ia_ctime = inode->i_ctime;
+		ctime = inode->i_ctime;
 	}
 
 	if (file_info->LastWriteTime) {
@@ -5505,13 +5507,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
 		rc = 0;
 	}
 
-	/*
-	 * HACK : set ctime here to avoid ctime changed
-	 * when file_info->ChangeTime is zero.
-	 */
-	attrs.ia_ctime = temp_attrs.ia_ctime;
-	attrs.ia_valid |= ATTR_CTIME;
-
 	if (attrs.ia_valid) {
 		struct dentry *dentry = filp->f_path.dentry;
 		struct inode *inode = d_inode(dentry);
@@ -5519,17 +5514,15 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf,
 		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 			return -EACCES;
 
-		rc = setattr_prepare(user_ns, dentry, &attrs);
-		if (rc)
-			return -EINVAL;
-
 		inode_lock(inode);
-		setattr_copy(user_ns, inode, &attrs);
-		attrs.ia_valid &= ~ATTR_CTIME;
 		rc = notify_change(user_ns, dentry, &attrs, NULL);
+		if (!rc) {
+			inode->i_ctime = ctime;
+			mark_inode_dirty(inode);
+		}
 		inode_unlock(inode);
 	}
-	return 0;
+	return rc;
 }
 
 static int set_file_allocation_info(struct ksmbd_work *work,
@@ -5624,6 +5617,7 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 			   char *buf)
 {
+	struct user_namespace *user_ns;
 	struct ksmbd_file *parent_fp;
 	struct dentry *parent;
 	struct dentry *dentry = fp->filp->f_path.dentry;
@@ -5634,11 +5628,12 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 		return -EACCES;
 	}
 
+	user_ns = file_mnt_user_ns(fp->filp);
 	if (ksmbd_stream_fd(fp))
 		goto next;
 
 	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(parent, dentry);
+	ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry);
 	if (ret) {
 		dput(parent);
 		return ret;
@@ -5655,7 +5650,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp,
 		}
 	}
 next:
-	return smb2_rename(work, fp,
+	return smb2_rename(work, fp, user_ns,
 			   (struct smb2_file_rename_info *)buf,
 			   work->sess->conn->local_nls);
 }
@@ -7116,8 +7111,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
 			netdev->ethtool_ops->get_link_ksettings(netdev, &cmd);
 			speed = cmd.base.speed;
 		} else {
-			pr_err("%s %s\n", netdev->name,
-			       "speed is unknown, defaulting to 1Gb/sec");
+			ksmbd_debug(SMB, "%s %s\n", netdev->name,
+				    "speed is unknown, defaulting to 1Gb/sec");
 			speed = SPEED_1000;
 		}
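The oplock.c and smb2pdu.c hunks replace from_kuid(user_ns, ...) with a two-step translation before an id goes on the wire: map the inode's kuid through the mount's user namespace, then convert it relative to init_user_ns. A hedged sketch of that direction using the same 5.15-era helpers the patch calls (wire_uid() is a hypothetical name):

#include <linux/fs.h>
#include <linux/uidgid.h>

static uid_t wire_uid(struct user_namespace *mnt_userns, struct inode *inode)
{
	/* Apply the idmapping of the mount the inode was found through. */
	kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);

	/* Then produce the id a client should see; unmapped ids become -1. */
	return from_kuid_munged(&init_user_ns, kuid);
}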
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index b108b918ec84..43d3123d8b62 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -291,7 +291,6 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
 				      char *search_pattern,
 				      int (*fn)(struct ksmbd_conn *, int,
 						struct ksmbd_dir_info *,
-						struct user_namespace *,
 						struct ksmbd_kstat *))
 {
 	int i, rc = 0;
@@ -322,8 +321,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
 						    user_ns,
 						    dir->filp->f_path.dentry->d_parent,
 						    &ksmbd_kstat);
-			rc = fn(conn, info_level, d_info,
-				user_ns, &ksmbd_kstat);
+			rc = fn(conn, info_level, d_info, &ksmbd_kstat);
 			if (rc)
 				break;
 			if (d_info->out_buf_len <= 0)
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index eb667d85558e..57c667c1be06 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -511,7 +511,6 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work,
 				      int (*fn)(struct ksmbd_conn *, int,
 						struct ksmbd_dir_info *,
-						struct user_namespace *,
 						struct ksmbd_kstat *));
 
 int ksmbd_extract_shortname(struct ksmbd_conn *conn,
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index 5456e3ad943e..0a95cdec8c80 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -274,24 +274,34 @@ static int sid_to_id(struct user_namespace *user_ns,
 		uid_t id;
 
 		id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
-		if (id > 0) {
-			uid = make_kuid(user_ns, id);
-			if (uid_valid(uid) && kuid_has_mapping(user_ns, uid)) {
-				fattr->cf_uid = uid;
-				rc = 0;
-			}
+		/*
+		 * Translate raw sid into kuid in the server's user
+		 * namespace.
+		 */
+		uid = make_kuid(&init_user_ns, id);
+
+		/* If this is an idmapped mount, apply the idmapping. */
+		uid = kuid_from_mnt(user_ns, uid);
+		if (uid_valid(uid)) {
+			fattr->cf_uid = uid;
+			rc = 0;
 		}
 	} else {
 		kgid_t gid;
 		gid_t id;
 
 		id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]);
-		if (id > 0) {
-			gid = make_kgid(user_ns, id);
-			if (gid_valid(gid) && kgid_has_mapping(user_ns, gid)) {
-				fattr->cf_gid = gid;
-				rc = 0;
-			}
+		/*
+		 * Translate raw sid into kgid in the server's user
+		 * namespace.
+		 */
+		gid = make_kgid(&init_user_ns, id);
+
+		/* If this is an idmapped mount, apply the idmapping. */
+		gid = kgid_from_mnt(user_ns, gid);
+		if (gid_valid(gid)) {
+			fattr->cf_gid = gid;
+			rc = 0;
 		}
 	}
 
@@ -587,14 +597,14 @@ static void set_posix_acl_entries_dacl(struct user_namespace *user_ns,
 			uid_t uid;
 			unsigned int sid_type = SIDOWNER;
 
-			uid = from_kuid(user_ns, pace->e_uid);
+			uid = posix_acl_uid_translate(user_ns, pace);
 			if (!uid)
 				sid_type = SIDUNIX_USER;
 			id_to_sid(uid, sid_type, sid);
 		} else if (pace->e_tag == ACL_GROUP) {
 			gid_t gid;
 
-			gid = from_kgid(user_ns, pace->e_gid);
+			gid = posix_acl_gid_translate(user_ns, pace);
 			id_to_sid(gid, SIDUNIX_GROUP, sid);
 		} else if (pace->e_tag == ACL_OTHER && !nt_aces_num) {
 			smb_copy_sid(sid, &sid_everyone);
@@ -653,12 +663,12 @@ posix_default_acl:
 		if (pace->e_tag == ACL_USER) {
 			uid_t uid;
 
-			uid = from_kuid(user_ns, pace->e_uid);
+			uid = posix_acl_uid_translate(user_ns, pace);
 			id_to_sid(uid, SIDCREATOR_OWNER, sid);
 		} else if (pace->e_tag == ACL_GROUP) {
 			gid_t gid;
 
-			gid = from_kgid(user_ns, pace->e_gid);
+			gid = posix_acl_gid_translate(user_ns, pace);
 			id_to_sid(gid, SIDCREATOR_GROUP, sid);
 		} else {
 			kfree(sid);
@@ -723,7 +733,7 @@ static void set_mode_dacl(struct user_namespace *user_ns,
 	}
 
 	/* owner RID */
-	uid = from_kuid(user_ns, fattr->cf_uid);
+	uid = from_kuid(&init_user_ns, fattr->cf_uid);
 	if (uid)
 		sid = &server_conf.domain_sid;
 	else
@@ -739,7 +749,7 @@ static void set_mode_dacl(struct user_namespace *user_ns,
 	ace_size = fill_ace_for_sid(pace, &sid_unix_groups,
 				    ACCESS_ALLOWED, 0, fattr->cf_mode, 0070);
 	pace->sid.sub_auth[pace->sid.num_subauth++] =
-		cpu_to_le32(from_kgid(user_ns, fattr->cf_gid));
+		cpu_to_le32(from_kgid(&init_user_ns, fattr->cf_gid));
 	pace->size = cpu_to_le16(ace_size + 4);
 	size += le16_to_cpu(pace->size);
 	pace = (struct smb_ace *)((char *)pndace + size);
@@ -880,7 +890,7 @@ int build_sec_desc(struct user_namespace *user_ns,
 	if (!nowner_sid_ptr)
 		return -ENOMEM;
 
-	uid = from_kuid(user_ns, fattr->cf_uid);
+	uid = from_kuid(&init_user_ns, fattr->cf_uid);
 	if (!uid)
 		sid_type = SIDUNIX_USER;
 	id_to_sid(uid, sid_type, nowner_sid_ptr);
@@ -891,7 +901,7 @@ int build_sec_desc(struct user_namespace *user_ns,
 		return -ENOMEM;
 	}
 
-	gid = from_kgid(user_ns, fattr->cf_gid);
+	gid = from_kgid(&init_user_ns, fattr->cf_gid);
 	id_to_sid(gid, SIDUNIX_GROUP, ngroup_sid_ptr);
 
 	offset = sizeof(struct smb_ntsd);
@@ -1234,11 +1244,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path,
 		pa_entry = posix_acls->a_entries;
 		for (i = 0; i < posix_acls->a_count; i++, pa_entry++) {
 			if (pa_entry->e_tag == ACL_USER)
-				id = from_kuid(user_ns,
-					       pa_entry->e_uid);
+				id = posix_acl_uid_translate(user_ns, pa_entry);
 			else if (pa_entry->e_tag == ACL_GROUP)
-				id = from_kgid(user_ns,
-					       pa_entry->e_gid);
+				id = posix_acl_gid_translate(user_ns, pa_entry);
 			else
 				continue;
 
@@ -1322,22 +1330,31 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
 	newattrs.ia_valid |= ATTR_MODE;
 	newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777);
 
-	inode_lock(inode);
-	rc = notify_change(user_ns, path->dentry, &newattrs, NULL);
-	inode_unlock(inode);
-	if (rc)
-		goto out;
-
 	ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry);
 	/* Update posix acls */
 	if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) {
 		rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS,
 				   fattr.cf_acls);
-		if (S_ISDIR(inode->i_mode) && fattr.cf_dacls)
+		if (rc < 0)
+			ksmbd_debug(SMB,
+				    "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
+				    rc);
+		if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) {
 			rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
 					   fattr.cf_dacls);
+			if (rc)
+				ksmbd_debug(SMB,
+					    "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
+					    rc);
+		}
 	}
 
+	inode_lock(inode);
+	rc = notify_change(user_ns, path->dentry, &newattrs, NULL);
+	inode_unlock(inode);
+	if (rc)
+		goto out;
+
 	/* Check it only calling from SD BUFFER context */
 	if (type_check && !(le16_to_cpu(pntsd->type) & DACL_PRESENT))
 		goto out;
diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h
index 940f686a1d95..73e08cad412b 100644
--- a/fs/ksmbd/smbacl.h
+++ b/fs/ksmbd/smbacl.h
@@ -209,4 +209,29 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
 		 bool type_check);
 void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid);
 void ksmbd_init_domain(u32 *sub_auth);
+
+static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns,
+					    struct posix_acl_entry *pace)
+{
+	kuid_t kuid;
+
+	/* If this is an idmapped mount, apply the idmapping. */
+	kuid = kuid_into_mnt(mnt_userns, pace->e_uid);
+
+	/* Translate the kuid into a userspace id ksmbd would see. */
+	return from_kuid(&init_user_ns, kuid);
+}
+
+static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns,
+					    struct posix_acl_entry *pace)
+{
+	kgid_t kgid;
+
+	/* If this is an idmapped mount, apply the idmapping. */
+	kgid = kgid_into_mnt(mnt_userns, pace->e_gid);
+
+	/* Translate the kgid into a userspace id ksmbd would see. */
+	return from_kgid(&init_user_ns, kgid);
+}
+
 #endif /* _SMBACL_H */
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 58f530056ac0..52b2556e76b1 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -1168,7 +1168,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t,
 			pr_err("failed to map buffer\n");
 			ret = -ENOMEM;
 			goto err;
-		} else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES - 1) {
+		} else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) {
 			pr_err("buffer not fitted into sges\n");
 			ret = -E2BIG;
 			ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt,
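sid_to_id() and the new posix_acl_uid_translate()/posix_acl_gid_translate() helpers split the reverse mapping the same way: interpret the client-supplied id relative to init_user_ns, then shift it by the mount's idmapping. A sketch of the decode direction under the same assumptions (kuid_from_mnt() as in the idmapped-mount API of this kernel generation; wire_id_to_kuid() is a hypothetical name):

#include <linux/mount.h>
#include <linux/uidgid.h>

static int wire_id_to_kuid(struct user_namespace *mnt_userns, u32 wire_id,
			   kuid_t *out)
{
	/* The raw id from the SID is relative to the server's namespace. */
	kuid_t uid = make_kuid(&init_user_ns, wire_id);

	/* If this is an idmapped mount, apply the idmapping. */
	uid = kuid_from_mnt(mnt_userns, uid);
	if (!uid_valid(uid))
		return -EINVAL;

	*out = uid;
	return 0;
}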
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index aee28ee6b19c..b047f2980d96 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -69,14 +69,15 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
  *
  * the reference count of @parent isn't incremented.
  */
-int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
+int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent,
+			  struct dentry *child)
 {
 	struct dentry *dentry;
 	int ret = 0;
 
 	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
-	dentry = lookup_one_len(child->d_name.name, parent,
-				child->d_name.len);
+	dentry = lookup_one(user_ns, child->d_name.name, parent,
+			    child->d_name.len);
 	if (IS_ERR(dentry)) {
 		ret = PTR_ERR(dentry);
 		goto out_err;
@@ -102,7 +103,7 @@ int ksmbd_vfs_may_delete(struct user_namespace *user_ns,
 	int ret;
 
 	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(parent, dentry);
+	ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry);
 	if (ret) {
 		dput(parent);
 		return ret;
@@ -137,7 +138,7 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns,
 		*daccess |= FILE_EXECUTE_LE;
 
 	parent = dget_parent(dentry);
-	ret = ksmbd_vfs_lock_parent(parent, dentry);
+	ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry);
 	if (ret) {
 		dput(parent);
 		return ret;
@@ -197,6 +198,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
  */
 int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
 {
+	struct user_namespace *user_ns;
 	struct path path;
 	struct dentry *dentry;
 	int err;
@@ -210,16 +212,16 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
 		return err;
 	}
 
+	user_ns = mnt_user_ns(path.mnt);
 	mode |= S_IFDIR;
-	err = vfs_mkdir(mnt_user_ns(path.mnt), d_inode(path.dentry),
-			dentry, mode);
+	err = vfs_mkdir(user_ns, d_inode(path.dentry), dentry, mode);
 	if (err) {
 		goto out;
 	} else if (d_unhashed(dentry)) {
 		struct dentry *d;
 
-		d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
-				   dentry->d_name.len);
+		d = lookup_one(user_ns, dentry->d_name.name, dentry->d_parent,
+			       dentry->d_name.len);
 		if (IS_ERR(d)) {
 			err = PTR_ERR(d);
 			goto out;
@@ -582,6 +584,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
  */
 int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
 {
+	struct user_namespace *user_ns;
 	struct path path;
 	struct dentry *parent;
 	int err;
@@ -601,8 +604,9 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
 		return err;
 	}
 
+	user_ns = mnt_user_ns(path.mnt);
 	parent = dget_parent(path.dentry);
-	err = ksmbd_vfs_lock_parent(parent, path.dentry);
+	err = ksmbd_vfs_lock_parent(user_ns, parent, path.dentry);
 	if (err) {
 		dput(parent);
 		path_put(&path);
@@ -616,14 +620,12 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
 	}
 
 	if (S_ISDIR(d_inode(path.dentry)->i_mode)) {
-		err = vfs_rmdir(mnt_user_ns(path.mnt), d_inode(parent),
-				path.dentry);
+		err = vfs_rmdir(user_ns, d_inode(parent), path.dentry);
 		if (err && err != -ENOTEMPTY)
 			ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name,
 				    err);
 	} else {
-		err = vfs_unlink(mnt_user_ns(path.mnt), d_inode(parent),
-				 path.dentry, NULL);
+		err = vfs_unlink(user_ns, d_inode(parent), path.dentry, NULL);
 		if (err)
 			ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name,
 				    err);
@@ -748,7 +750,8 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work,
 	if (ksmbd_override_fsids(work))
 		return -ENOMEM;
 
-	dst_dent = lookup_one_len(dst_name, dst_dent_parent, strlen(dst_name));
+	dst_dent = lookup_one(dst_user_ns, dst_name, dst_dent_parent,
+			      strlen(dst_name));
 	err = PTR_ERR(dst_dent);
 	if (IS_ERR(dst_dent)) {
 		pr_err("lookup failed %s [%d]\n", dst_name, err);
@@ -779,6 +782,7 @@ out:
 int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
 			char *newname)
 {
+	struct user_namespace *user_ns;
 	struct path dst_path;
 	struct dentry *src_dent_parent, *dst_dent_parent;
 	struct dentry *src_dent, *trap_dent, *src_child;
@@ -808,8 +812,9 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
 	trap_dent = lock_rename(src_dent_parent, dst_dent_parent);
 	dget(src_dent);
 	dget(dst_dent_parent);
-	src_child = lookup_one_len(src_dent->d_name.name, src_dent_parent,
-				   src_dent->d_name.len);
+	user_ns = file_mnt_user_ns(fp->filp);
+	src_child = lookup_one(user_ns, src_dent->d_name.name, src_dent_parent,
+			       src_dent->d_name.len);
 	if (IS_ERR(src_child)) {
 		err = PTR_ERR(src_child);
 		goto out_lock;
@@ -823,7 +828,7 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp,
 	dput(src_child);
 
 	err = __ksmbd_vfs_rename(work,
-				 file_mnt_user_ns(fp->filp),
+				 user_ns,
 				 src_dent_parent,
 				 src_dent,
 				 mnt_user_ns(dst_path.mnt),
@@ -1109,7 +1114,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns,
 {
 	int err = 0;
 
-	err = ksmbd_vfs_lock_parent(dir, dentry);
+	err = ksmbd_vfs_lock_parent(user_ns, dir, dentry);
 	if (err)
 		return err;
 
 	dget(dentry);
@@ -1385,14 +1390,14 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac
 		switch (pa_entry->e_tag) {
 		case ACL_USER:
 			xa_entry->type = SMB_ACL_USER;
-			xa_entry->uid = from_kuid(user_ns, pa_entry->e_uid);
+			xa_entry->uid = posix_acl_uid_translate(user_ns, pa_entry);
 			break;
 		case ACL_USER_OBJ:
 			xa_entry->type = SMB_ACL_USER_OBJ;
 			break;
 		case ACL_GROUP:
 			xa_entry->type = SMB_ACL_GROUP;
-			xa_entry->gid = from_kgid(user_ns, pa_entry->e_gid);
+			xa_entry->gid = posix_acl_gid_translate(user_ns, pa_entry);
 			break;
 		case ACL_GROUP_OBJ:
 			xa_entry->type = SMB_ACL_GROUP_OBJ;
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index cb0cba0d5d07..85db50abdb24 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -107,7 +107,8 @@ struct ksmbd_kstat {
 	__le32 file_attributes;
 };
 
-int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child);
+int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent,
+			  struct dentry *child);
 int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry);
 int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns,
 				   struct dentry *dentry, __le32 *daccess);
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 92d8c61ffd2a..29c1db66bd0f 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -666,22 +666,6 @@ void ksmbd_free_global_file_table(void)
 	ksmbd_destroy_file_table(&global_ft);
 }
 
-int ksmbd_file_table_flush(struct ksmbd_work *work)
-{
-	struct ksmbd_file *fp = NULL;
-	unsigned int id;
-	int ret;
-
-	read_lock(&work->sess->file_table.lock);
-	idr_for_each_entry(work->sess->file_table.idr, fp, id) {
-		ret = ksmbd_vfs_fsync(work, fp->volatile_id, KSMBD_NO_FID);
-		if (ret)
-			break;
-	}
-	read_unlock(&work->sess->file_table.lock);
-	return ret;
-}
-
 int ksmbd_init_file_table(struct ksmbd_file_table *ft)
 {
 	ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL);
diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h
index 70dfe6a99f13..448576fbe4b7 100644
--- a/fs/ksmbd/vfs_cache.h
+++ b/fs/ksmbd/vfs_cache.h
@@ -152,7 +152,6 @@ void ksmbd_close_session_fds(struct ksmbd_work *work);
 int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode);
 int ksmbd_init_global_file_table(void);
 void ksmbd_free_global_file_table(void);
-int ksmbd_file_table_flush(struct ksmbd_work *work);
 void ksmbd_set_fd_limit(unsigned long limit);
 
 /*
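Every lookup_one_len() call site in ksmbd above gains a user_namespace argument because lookup_one() checks exec permission on the parent directory against the mount's (possibly idmapped) namespace rather than the caller's. A hedged sketch of the locked-parent lookup pattern that ksmbd_vfs_lock_parent() implements (locked_child_lookup() is a hypothetical name):

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/namei.h>

static struct dentry *locked_child_lookup(struct user_namespace *user_ns,
					  struct dentry *parent,
					  const char *name, int len)
{
	struct dentry *child;

	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
	child = lookup_one(user_ns, name, parent, len);
	if (IS_ERR(child))
		inode_unlock(d_inode(parent));
	/* on success the caller unlocks the parent when it is done */
	return child;
}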
diff --git a/fs/namei.c b/fs/namei.c
index 95a881e0552b..1946d9667790 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -255,7 +255,7 @@ getname_kernel(const char * filename)
 
 void putname(struct filename *name)
 {
-	if (IS_ERR_OR_NULL(name))
+	if (IS_ERR(name))
 		return;
 
 	BUG_ON(name->refcnt <= 0);
@@ -2467,7 +2467,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
 	return err;
 }
 
-static int __filename_lookup(int dfd, struct filename *name, unsigned flags,
+int filename_lookup(int dfd, struct filename *name, unsigned flags,
 		    struct path *path, struct path *root)
 {
 	int retval;
@@ -2488,15 +2488,6 @@ static int __filename_lookup(int dfd, struct filename *name, unsigned flags,
 	return retval;
 }
 
-int filename_lookup(int dfd, struct filename *name, unsigned flags,
-		    struct path *path, struct path *root)
-{
-	int retval = __filename_lookup(dfd, name, flags, path, root);
-
-	putname(name);
-	return retval;
-}
-
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int path_parentat(struct nameidata *nd, unsigned flags,
 				struct path *parent)
@@ -2514,9 +2505,10 @@ static int path_parentat(struct nameidata *nd, unsigned flags,
 	return err;
 }
 
-static int __filename_parentat(int dfd, struct filename *name,
-			       unsigned int flags, struct path *parent,
-			       struct qstr *last, int *type)
+/* Note: this does not consume "name" */
+static int filename_parentat(int dfd, struct filename *name,
+			     unsigned int flags, struct path *parent,
+			     struct qstr *last, int *type)
 {
 	int retval;
 	struct nameidata nd;
@@ -2538,25 +2530,14 @@ static int __filename_parentat(int dfd, struct filename *name,
 	return retval;
 }
 
-static int filename_parentat(int dfd, struct filename *name,
-			     unsigned int flags, struct path *parent,
-			     struct qstr *last, int *type)
-{
-	int retval = __filename_parentat(dfd, name, flags, parent, last, type);
-
-	putname(name);
-	return retval;
-}
-
 /* does lookup, returns the object with parent locked */
-struct dentry *kern_path_locked(const char *name, struct path *path)
+static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
 {
 	struct dentry *d;
 	struct qstr last;
 	int type, error;
 
-	error = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
-				  &last, &type);
+	error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type);
 	if (error)
 		return ERR_PTR(error);
 	if (unlikely(type != LAST_NORM)) {
@@ -2572,10 +2553,23 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
 	return d;
 }
 
+struct dentry *kern_path_locked(const char *name, struct path *path)
+{
+	struct filename *filename = getname_kernel(name);
+	struct dentry *res = __kern_path_locked(filename, path);
+
+	putname(filename);
+	return res;
+}
+
 int kern_path(const char *name, unsigned int flags, struct path *path)
 {
-	return filename_lookup(AT_FDCWD, getname_kernel(name),
-			       flags, path, NULL);
+	struct filename *filename = getname_kernel(name);
+	int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
+
+	putname(filename);
+	return ret;
+
 }
 EXPORT_SYMBOL(kern_path);
 
@@ -2591,10 +2585,15 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct path *path)
 {
+	struct filename *filename;
 	struct path root = {.mnt = mnt, .dentry = dentry};
+	int ret;
+
+	filename = getname_kernel(name);
 	/* the first argument of filename_lookup() is ignored with root */
-	return filename_lookup(AT_FDCWD, getname_kernel(name),
-			       flags , path, &root);
+	ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
+	putname(filename);
+	return ret;
 }
 EXPORT_SYMBOL(vfs_path_lookup);
 
@@ -2798,8 +2797,11 @@ int path_pts(struct path *path)
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
-	return filename_lookup(dfd, getname_flags(name, flags, empty),
-			       flags, path, NULL);
+	struct filename *filename = getname_flags(name, flags, empty);
+	int ret = filename_lookup(dfd, filename, flags, path, NULL);
+
+	putname(filename);
+	return ret;
 }
 EXPORT_SYMBOL(user_path_at_empty);
 
@@ -3618,8 +3620,8 @@ struct file *do_file_open_root(const struct path *root,
 	return file;
 }
 
-static struct dentry *__filename_create(int dfd, struct filename *name,
-				struct path *path, unsigned int lookup_flags)
+static struct dentry *filename_create(int dfd, struct filename *name,
+				      struct path *path, unsigned int lookup_flags)
 {
 	struct dentry *dentry = ERR_PTR(-EEXIST);
 	struct qstr last;
@@ -3634,7 +3636,7 @@ static struct dentry *__filename_create(int dfd, struct filename *name,
 	 */
 	lookup_flags &= LOOKUP_REVAL;
 
-	error = __filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+	error = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
 	if (error)
 		return ERR_PTR(error);
 
@@ -3687,21 +3689,15 @@ out:
 	return dentry;
 }
 
-static inline struct dentry *filename_create(int dfd, struct filename *name,
+struct dentry *kern_path_create(int dfd, const char *pathname,
 				struct path *path, unsigned int lookup_flags)
 {
-	struct dentry *res = __filename_create(dfd, name, path, lookup_flags);
+	struct filename *filename = getname_kernel(pathname);
+	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
 
-	putname(name);
+	putname(filename);
 	return res;
 }
-
-struct dentry *kern_path_create(int dfd, const char *pathname,
-				struct path *path, unsigned int lookup_flags)
-{
-	return filename_create(dfd, getname_kernel(pathname),
-			       path, lookup_flags);
-}
 EXPORT_SYMBOL(kern_path_create);
 
 void done_path_create(struct path *path, struct dentry *dentry)
@@ -3716,7 +3712,11 @@ EXPORT_SYMBOL(done_path_create);
 inline struct dentry *user_path_create(int dfd, const char __user *pathname,
 				struct path *path, unsigned int lookup_flags)
 {
-	return filename_create(dfd, getname(pathname), path, lookup_flags);
+	struct filename *filename = getname(pathname);
+	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
+
+	putname(filename);
+	return res;
 }
 EXPORT_SYMBOL(user_path_create);
 
@@ -3797,7 +3797,7 @@ static int do_mknodat(int dfd, struct filename *name, umode_t mode,
 	if (error)
 		goto out1;
 retry:
-	dentry = __filename_create(dfd, name, &path, lookup_flags);
+	dentry = filename_create(dfd, name, &path, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto out1;
@@ -3897,7 +3897,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode)
 	unsigned int lookup_flags = LOOKUP_DIRECTORY;
 
 retry:
-	dentry = __filename_create(dfd, name, &path, lookup_flags);
+	dentry = filename_create(dfd, name, &path, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto out_putname;
@@ -3996,7 +3996,7 @@ int do_rmdir(int dfd, struct filename *name)
 	int type;
 	unsigned int lookup_flags = 0;
 retry:
-	error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
 	if (error)
 		goto exit1;
 
@@ -4137,7 +4137,7 @@ int do_unlinkat(int dfd, struct filename *name)
 	struct inode *delegated_inode = NULL;
 	unsigned int lookup_flags = 0;
retry:
-	error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+	error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
 	if (error)
 		goto exit1;
@@ -4266,7 +4266,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
 		goto out_putnames;
 	}
retry:
-	dentry = __filename_create(newdfd, to, &path, lookup_flags);
+	dentry = filename_create(newdfd, to, &path, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
 		goto out_putnames;
@@ -4426,11 +4426,11 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
 	if (flags & AT_SYMLINK_FOLLOW)
 		how |= LOOKUP_FOLLOW;
retry:
-	error = __filename_lookup(olddfd, old, how, &old_path, NULL);
+	error = filename_lookup(olddfd, old, how, &old_path, NULL);
 	if (error)
 		goto out_putnames;
 
-	new_dentry = __filename_create(newdfd, new, &new_path,
+	new_dentry = filename_create(newdfd, new, &new_path,
 				       (how & LOOKUP_REVAL));
 	error = PTR_ERR(new_dentry);
 	if (IS_ERR(new_dentry))
@@ -4689,13 +4689,13 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 		target_flags = 0;
 
retry:
-	error = __filename_parentat(olddfd, from, lookup_flags, &old_path,
-				    &old_last, &old_type);
+	error = filename_parentat(olddfd, from, lookup_flags, &old_path,
+				  &old_last, &old_type);
 	if (error)
 		goto put_names;
 
-	error = __filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
-				    &new_type);
+	error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
+				  &new_type);
 	if (error)
 		goto exit1;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 95006d1d29ab..fa1d99101f89 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -531,6 +531,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 		/* Someone else created list structure for us */
 		if (inode)
 			fsnotify_put_inode_ref(inode);
+		fsnotify_put_sb_connectors(conn);
 		kmem_cache_free(fsnotify_mark_connector_cachep, conn);
 	}
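The namei.c reshuffle above changes the ownership rule: filename_lookup(), filename_parentat() and filename_create() now borrow the struct filename, so whichever caller did getname() also does putname(), exactly once, on every path (these helpers are fs-internal, declared in fs/internal.h). A sketch of a caller in the new style (lookup_and_drop() is a hypothetical name):

#include <linux/fs.h>
#include <linux/namei.h>

static int lookup_and_drop(int dfd, const char __user *upath,
			   struct path *path)
{
	struct filename *name = getname(upath);
	int err;

	if (IS_ERR(name))
		return PTR_ERR(name);

	err = filename_lookup(dfd, name, LOOKUP_FOLLOW, path, NULL);
	putname(name);	/* always ours to put, error or not */
	return err;
}

This single-owner rule is also why putname() no longer tolerates NULL: with one owner per name there is no legitimate NULL to swallow, and the old IS_ERR_OR_NULL() check would only have hidden refcount bugs.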
diff --git a/fs/cifs_common/Makefile b/fs/smbfs_common/Makefile
index 6fedd2f88a25..cafc61a3bfc3 100644
--- a/fs/cifs_common/Makefile
+++ b/fs/smbfs_common/Makefile
@@ -3,5 +3,5 @@
 #
 # Makefile for Linux filesystem routines that are shared by client and server.
 #
-obj-$(CONFIG_CIFS_COMMON) += cifs_arc4.o
-obj-$(CONFIG_CIFS_COMMON) += cifs_md4.o
+obj-$(CONFIG_SMBFS_COMMON) += cifs_arc4.o
+obj-$(CONFIG_SMBFS_COMMON) += cifs_md4.o
diff --git a/fs/cifs_common/arc4.h b/fs/smbfs_common/arc4.h
index 12e71ec033a1..12e71ec033a1 100644
--- a/fs/cifs_common/arc4.h
+++ b/fs/smbfs_common/arc4.h
diff --git a/fs/cifs_common/cifs_arc4.c b/fs/smbfs_common/cifs_arc4.c
index b964cc682944..85ba15a60b13 100644
--- a/fs/cifs_common/cifs_arc4.c
+++ b/fs/smbfs_common/cifs_arc4.c
@@ -74,14 +74,14 @@ void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int l
 EXPORT_SYMBOL_GPL(cifs_arc4_crypt);
 
 static int __init
-init_cifs_common(void)
+init_smbfs_common(void)
 {
 	return 0;
 }
 
 static void __init
-exit_cifs_common(void)
+exit_smbfs_common(void)
 {
 }
 
-module_init(init_cifs_common)
-module_exit(exit_cifs_common)
+module_init(init_smbfs_common)
+module_exit(exit_smbfs_common)
diff --git a/fs/cifs_common/cifs_md4.c b/fs/smbfs_common/cifs_md4.c
index 50f78cfc6ce9..50f78cfc6ce9 100644
--- a/fs/cifs_common/cifs_md4.c
+++ b/fs/smbfs_common/cifs_md4.c
diff --git a/fs/cifs_common/md4.h b/fs/smbfs_common/md4.h
index 5337becc699a..5337becc699a 100644
--- a/fs/cifs_common/md4.h
+++ b/fs/smbfs_common/md4.h
diff --git a/fs/cifs/smbfsctl.h b/fs/smbfs_common/smbfsctl.h
index d0fc42061f49..d01e8c9d7a31 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/smbfs_common/smbfsctl.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: LGPL-2.1 */
+/* SPDX-License-Identifier: LGPL-2.1+ */
 /*
  *   fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
  *
@@ -19,11 +19,14 @@
  * could be invoked from tools via a specialized hook into the VFS rather
  * than via the standard vfs entry points
  *
- * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
+ * See MS-SMB2 Section 2.2.31 (last checked September 2021, all of that list are
 * below). Additional detail on less common ones can be found in MS-FSCC
 * section 2.3.
 */
 
+#ifndef __SMBFSCTL_H
+#define __SMBFSCTL_H
+
 /*
 * FSCTL values are 32 bits and are constructed as
 * <device 16bits> <access 2bits> <function 12bits> <method 2bits>
@@ -91,6 +94,7 @@
 #define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
 #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
 #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
+#define FSCTL_GET_REFS_VOLUME_DATA   0x000902D8 /* See MS-FSCC 2.3.24 */
 #define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3
 #define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b
 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF
@@ -146,7 +150,13 @@
 #define IO_REPARSE_TAG_LX_CHR	     0x80000025
 #define IO_REPARSE_TAG_LX_BLK	     0x80000026
 
+#define IO_REPARSE_TAG_LX_SYMLINK_LE	cpu_to_le32(0xA000001D)
+#define IO_REPARSE_TAG_AF_UNIX_LE	cpu_to_le32(0x80000023)
+#define IO_REPARSE_TAG_LX_FIFO_LE	cpu_to_le32(0x80000024)
+#define IO_REPARSE_TAG_LX_CHR_LE	cpu_to_le32(0x80000025)
+#define IO_REPARSE_TAG_LX_BLK_LE	cpu_to_le32(0x80000026)
+
 /* fsctl flags */
 /* If Flags is set to this value, the request is an FSCTL not ioctl request */
 #define SMB2_0_IOCTL_IS_FSCTL		0x00000001
-
+#endif /* __SMBFSCTL_H */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9a86d3ec2cb6..c4e0cd1c1c8c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -330,6 +330,15 @@ xfs_set_inode_alloc(
 	return xfs_is_inode32(mp) ? maxagi : agcount;
 }
 
+static bool
+xfs_buftarg_is_dax(
+	struct super_block	*sb,
+	struct xfs_buftarg	*bt)
+{
+	return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
+			bdev_nr_sectors(bt->bt_bdev));
+}
+
 STATIC int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
@@ -1588,11 +1597,10 @@ xfs_fs_fill_super(
 		xfs_warn(mp,
 	"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
 
-		datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
-			sb->s_blocksize);
+		datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp);
 		if (mp->m_rtdev_targp)
-			rtdev_is_dax = bdev_dax_supported(
-				mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
+			rtdev_is_dax = xfs_buftarg_is_dax(sb,
+						mp->m_rtdev_targp);
 		if (!rtdev_is_dax && !datadev_is_dax) {
 			xfs_alert(mp,
 	"DAX unsupported by block device. Turning off DAX.");
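With bdev_dax_supported() gone, a filesystem asks the DAX core directly whether the range it intends to use is DAX-capable, passing its own dax_device handle. A hedged sketch of that probe shape, modeled on the xfs_buftarg_is_dax() helper above (fs_can_use_dax() is a hypothetical name):

#include <linux/blkdev.h>
#include <linux/dax.h>

static bool fs_can_use_dax(struct super_block *sb, struct dax_device *daxdev,
			   struct block_device *bdev)
{
	/* whole-device probe: sector 0 through the full bdev length */
	return dax_supported(daxdev, bdev, sb->s_blocksize, 0,
			     bdev_nr_sectors(bdev));
}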