Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c14
-rw-r--r--fs/adfs/inode.c5
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/amigaffs.h3
-rw-r--r--fs/affs/dir.c44
-rw-r--r--fs/affs/file.c22
-rw-r--r--fs/afs/file.c29
-rw-r--r--fs/afs/fsclient.c9
-rw-r--r--fs/afs/inode.c11
-rw-r--r--fs/afs/write.c4
-rw-r--r--fs/afs/yfsclient.c9
-rw-r--r--fs/aio.c2
-rw-r--r--fs/attr.c14
-rw-r--r--fs/autofs/autofs_i.h4
-rw-r--r--fs/autofs/dev-ioctl.c97
-rw-r--r--fs/autofs/expire.c7
-rw-r--r--fs/autofs/inode.c5
-rw-r--r--fs/backing-file.c5
-rw-r--r--fs/bcachefs/acl.c11
-rw-r--r--fs/bcachefs/acl.h2
-rw-r--r--fs/bcachefs/alloc_background.c159
-rw-r--r--fs/bcachefs/alloc_background.h42
-rw-r--r--fs/bcachefs/alloc_background_format.h1
-rw-r--r--fs/bcachefs/alloc_foreground.c34
-rw-r--r--fs/bcachefs/alloc_foreground.h9
-rw-r--r--fs/bcachefs/backpointers.c23
-rw-r--r--fs/bcachefs/backpointers.h5
-rw-r--r--fs/bcachefs/bcachefs.h3
-rw-r--r--fs/bcachefs/bcachefs_format.h7
-rw-r--r--fs/bcachefs/bkey.h7
-rw-r--r--fs/bcachefs/bkey_methods.c109
-rw-r--r--fs/bcachefs/bkey_methods.h21
-rw-r--r--fs/bcachefs/btree_cache.c25
-rw-r--r--fs/bcachefs/btree_cache.h2
-rw-r--r--fs/bcachefs/btree_gc.c5
-rw-r--r--fs/bcachefs/btree_io.c69
-rw-r--r--fs/bcachefs/btree_iter.c6
-rw-r--r--fs/bcachefs/btree_iter.h51
-rw-r--r--fs/bcachefs/btree_journal_iter.c2
-rw-r--r--fs/bcachefs/btree_key_cache.c36
-rw-r--r--fs/bcachefs/btree_key_cache.h18
-rw-r--r--fs/bcachefs/btree_node_scan.c2
-rw-r--r--fs/bcachefs/btree_trans_commit.c82
-rw-r--r--fs/bcachefs/btree_update_interior.c64
-rw-r--r--fs/bcachefs/buckets.c123
-rw-r--r--fs/bcachefs/buckets.h2
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c15
-rw-r--r--fs/bcachefs/data_update.c216
-rw-r--r--fs/bcachefs/debug.c38
-rw-r--r--fs/bcachefs/dirent.c33
-rw-r--r--fs/bcachefs/dirent.h5
-rw-r--r--fs/bcachefs/disk_accounting.c89
-rw-r--r--fs/bcachefs/disk_accounting.h60
-rw-r--r--fs/bcachefs/disk_accounting_format.h23
-rw-r--r--fs/bcachefs/ec.c49
-rw-r--r--fs/bcachefs/ec.h9
-rw-r--r--fs/bcachefs/errcode.h2
-rw-r--r--fs/bcachefs/error.c22
-rw-r--r--fs/bcachefs/error.h39
-rw-r--r--fs/bcachefs/extents.c234
-rw-r--r--fs/bcachefs/extents.h48
-rw-r--r--fs/bcachefs/fs-io-buffered.c159
-rw-r--r--fs/bcachefs/fs-io-buffered.h6
-rw-r--r--fs/bcachefs/fs-ioctl.c3
-rw-r--r--fs/bcachefs/fs.c28
-rw-r--r--fs/bcachefs/fs.h7
-rw-r--r--fs/bcachefs/fsck.c24
-rw-r--r--fs/bcachefs/inode.c77
-rw-r--r--fs/bcachefs/inode.h24
-rw-r--r--fs/bcachefs/io_misc.c6
-rw-r--r--fs/bcachefs/io_read.c1
-rw-r--r--fs/bcachefs/io_write.c5
-rw-r--r--fs/bcachefs/journal.c2
-rw-r--r--fs/bcachefs/journal_io.c24
-rw-r--r--fs/bcachefs/journal_sb.c15
-rw-r--r--fs/bcachefs/lru.c9
-rw-r--r--fs/bcachefs/lru.h5
-rw-r--r--fs/bcachefs/movinggc.c2
-rw-r--r--fs/bcachefs/opts.h5
-rw-r--r--fs/bcachefs/quota.c8
-rw-r--r--fs/bcachefs/quota.h5
-rw-r--r--fs/bcachefs/recovery.c9
-rw-r--r--fs/bcachefs/reflink.c19
-rw-r--r--fs/bcachefs/reflink.h22
-rw-r--r--fs/bcachefs/replicas.c9
-rw-r--r--fs/bcachefs/sb-downgrade.c37
-rw-r--r--fs/bcachefs/sb-errors_format.h8
-rw-r--r--fs/bcachefs/sb-members.c3
-rw-r--r--fs/bcachefs/sb-members_format.h5
-rw-r--r--fs/bcachefs/snapshot.c42
-rw-r--r--fs/bcachefs/snapshot.h11
-rw-r--r--fs/bcachefs/subvolume.c16
-rw-r--r--fs/bcachefs/subvolume.h5
-rw-r--r--fs/bcachefs/super-io.c4
-rw-r--r--fs/bcachefs/super.c1
-rw-r--r--fs/bcachefs/sysfs.c8
-rw-r--r--fs/bcachefs/trace.c1
-rw-r--r--fs/bcachefs/trace.h27
-rw-r--r--fs/bcachefs/util.c1
-rw-r--r--fs/bcachefs/xattr.c33
-rw-r--r--fs/bcachefs/xattr.h5
-rw-r--r--fs/bfs/file.c4
-rw-r--r--fs/binfmt_elf.c57
-rw-r--r--fs/binfmt_elf_fdpic.c3
-rw-r--r--fs/binfmt_flat.c4
-rw-r--r--fs/btrfs/backref.c6
-rw-r--r--fs/btrfs/bio.c102
-rw-r--r--fs/btrfs/bio.h6
-rw-r--r--fs/btrfs/block-group.c34
-rw-r--r--fs/btrfs/block-group.h11
-rw-r--r--fs/btrfs/block-rsv.c2
-rw-r--r--fs/btrfs/block-rsv.h2
-rw-r--r--fs/btrfs/btrfs_inode.h24
-rw-r--r--fs/btrfs/compression.c82
-rw-r--r--fs/btrfs/compression.h16
-rw-r--r--fs/btrfs/ctree.c18
-rw-r--r--fs/btrfs/ctree.h12
-rw-r--r--fs/btrfs/defrag.c97
-rw-r--r--fs/btrfs/defrag.h3
-rw-r--r--fs/btrfs/delayed-ref.c103
-rw-r--r--fs/btrfs/delayed-ref.h6
-rw-r--r--fs/btrfs/dev-replace.c43
-rw-r--r--fs/btrfs/direct-io.c89
-rw-r--r--fs/btrfs/discard.c4
-rw-r--r--fs/btrfs/disk-io.c16
-rw-r--r--fs/btrfs/extent-io-tree.c55
-rw-r--r--fs/btrfs/extent-io-tree.h38
-rw-r--r--fs/btrfs/extent-tree.c55
-rw-r--r--fs/btrfs/extent_io.c861
-rw-r--r--fs/btrfs/extent_io.h12
-rw-r--r--fs/btrfs/extent_map.c31
-rw-r--r--fs/btrfs/fiemap.c2
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file-item.h2
-rw-r--r--fs/btrfs/file.c38
-rw-r--r--fs/btrfs/free-space-cache.c14
-rw-r--r--fs/btrfs/fs.h2
-rw-r--r--fs/btrfs/inode-item.c10
-rw-r--r--fs/btrfs/inode-item.h4
-rw-r--r--fs/btrfs/inode.c383
-rw-r--r--fs/btrfs/ioctl.c11
-rw-r--r--fs/btrfs/lzo.c12
-rw-r--r--fs/btrfs/ordered-data.c30
-rw-r--r--fs/btrfs/ordered-data.h6
-rw-r--r--fs/btrfs/orphan.c24
-rw-r--r--fs/btrfs/print-tree.c2
-rw-r--r--fs/btrfs/qgroup.c71
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/raid-stripe-tree.c46
-rw-r--r--fs/btrfs/reflink.c35
-rw-r--r--fs/btrfs/relocation.c22
-rw-r--r--fs/btrfs/scrub.c37
-rw-r--r--fs/btrfs/send.c58
-rw-r--r--fs/btrfs/space-info.c38
-rw-r--r--fs/btrfs/space-info.h10
-rw-r--r--fs/btrfs/subpage.c277
-rw-r--r--fs/btrfs/subpage.h60
-rw-r--r--fs/btrfs/super.c23
-rw-r--r--fs/btrfs/tests/extent-io-tests.c10
-rw-r--r--fs/btrfs/transaction.c5
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-checker.c74
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/tree-mod-log.c14
-rw-r--r--fs/btrfs/tree-mod-log.h6
-rw-r--r--fs/btrfs/uuid-tree.c179
-rw-r--r--fs/btrfs/uuid-tree.h2
-rw-r--r--fs/btrfs/verity.c20
-rw-r--r--fs/btrfs/volumes.c228
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c33
-rw-r--r--fs/btrfs/zoned.c66
-rw-r--r--fs/btrfs/zoned.h4
-rw-r--r--fs/btrfs/zstd.c35
-rw-r--r--fs/buffer.c71
-rw-r--r--fs/cachefiles/io.c19
-rw-r--r--fs/cachefiles/xattr.c34
-rw-r--r--fs/ceph/addr.c117
-rw-r--r--fs/ceph/dir.c1
-rw-r--r--fs/ceph/inode.c3
-rw-r--r--fs/coda/inode.c43
-rw-r--r--fs/coredump.c166
-rw-r--r--fs/dcache.c19
-rw-r--r--fs/debugfs/inode.c8
-rw-r--r--fs/direct-io.c6
-rw-r--r--fs/ecryptfs/mmap.c86
-rw-r--r--fs/erofs/Kconfig22
-rw-r--r--fs/erofs/Makefile1
-rw-r--r--fs/erofs/data.c109
-rw-r--r--fs/erofs/decompressor.c2
-rw-r--r--fs/erofs/dir.c35
-rw-r--r--fs/erofs/erofs_fs.h5
-rw-r--r--fs/erofs/fileio.c192
-rw-r--r--fs/erofs/inode.c152
-rw-r--r--fs/erofs/internal.h28
-rw-r--r--fs/erofs/super.c106
-rw-r--r--fs/erofs/sysfs.c30
-rw-r--r--fs/erofs/zdata.c196
-rw-r--r--fs/erofs/zmap.c42
-rw-r--r--fs/erofs/zutil.c3
-rw-r--r--fs/eventpoll.c7
-rw-r--r--fs/exec.c39
-rw-r--r--fs/exfat/file.c8
-rw-r--r--fs/exfat/inode.c9
-rw-r--r--fs/ext2/dir.c32
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--fs/ext4/dir.c50
-rw-r--r--fs/ext4/ext4.h6
-rw-r--r--fs/ext4/inline.c21
-rw-r--r--fs/ext4/inode.c37
-rw-r--r--fs/ext4/verity.c8
-rw-r--r--fs/f2fs/data.c87
-rw-r--r--fs/f2fs/super.c8
-rw-r--r--fs/f2fs/verity.c8
-rw-r--r--fs/fat/inode.c9
-rw-r--r--fs/fcntl.c186
-rw-r--r--fs/fhandle.c29
-rw-r--r--fs/file.c32
-rw-r--r--fs/file_table.c26
-rw-r--r--fs/fs-writeback.c67
-rw-r--r--fs/fuse/dev.c20
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/fuse/file.c55
-rw-r--r--fs/fuse/inode.c7
-rw-r--r--fs/fuse/xattr.c4
-rw-r--r--fs/hfs/extent.c6
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c5
-rw-r--r--fs/hfsplus/extents.c6
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c5
-rw-r--r--fs/hostfs/hostfs_kern.c23
-rw-r--r--fs/hpfs/file.c9
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c135
-rw-r--r--fs/internal.h1
-rw-r--r--fs/iomap/buffered-io.c2
-rw-r--r--fs/jffs2/file.c88
-rw-r--r--fs/jffs2/gc.c25
-rw-r--r--fs/jfs/inode.c8
-rw-r--r--fs/jfs/jfs_discard.c11
-rw-r--r--fs/jfs/jfs_dmap.c11
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/xattr.c2
-rw-r--r--fs/libfs.c82
-rw-r--r--fs/locks.c8
-rw-r--r--fs/minix/dir.c134
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/minix/minix.h40
-rw-r--r--fs/minix/namei.c32
-rw-r--r--fs/mnt_idmapping.c12
-rw-r--r--fs/mount.h14
-rw-r--r--fs/namei.c85
-rw-r--r--fs/namespace.c92
-rw-r--r--fs/netfs/Kconfig2
-rw-r--r--fs/netfs/Makefile4
-rw-r--r--fs/netfs/buffered_read.c699
-rw-r--r--fs/netfs/buffered_write.c311
-rw-r--r--fs/netfs/direct_read.c147
-rw-r--r--fs/netfs/fscache_cookie.c4
-rw-r--r--fs/netfs/fscache_main.c1
-rw-r--r--fs/netfs/internal.h43
-rw-r--r--fs/netfs/io.c647
-rw-r--r--fs/netfs/iterator.c50
-rw-r--r--fs/netfs/locking.c22
-rw-r--r--fs/netfs/main.c11
-rw-r--r--fs/netfs/misc.c154
-rw-r--r--fs/netfs/objects.c26
-rw-r--r--fs/netfs/read_collect.c544
-rw-r--r--fs/netfs/read_pgpriv2.c264
-rw-r--r--fs/netfs/read_retry.c256
-rw-r--r--fs/netfs/stats.c27
-rw-r--r--fs/netfs/write_collect.c253
-rw-r--r--fs/netfs/write_issue.c95
-rw-r--r--fs/nfs/callback_xdr.c6
-rw-r--r--fs/nfs/delegation.c15
-rw-r--r--fs/nfs/file.c7
-rw-r--r--fs/nfs/fscache.c24
-rw-r--r--fs/nfs/fscache.h9
-rw-r--r--fs/nfs/nfs4proc.c12
-rw-r--r--fs/nfs/pnfs.c5
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/nfs4state.c62
-rw-r--r--fs/nfsd/nfs4xdr.c6
-rw-r--r--fs/nfsd/nfsctl.c3
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nilfs2/dir.c4
-rw-r--r--fs/nilfs2/inode.c10
-rw-r--r--fs/nilfs2/recovery.c51
-rw-r--r--fs/nilfs2/segment.c10
-rw-r--r--fs/nilfs2/sysfs.c43
-rw-r--r--fs/notify/dnotify/dnotify.c6
-rw-r--r--fs/nsfs.c102
-rw-r--r--fs/ntfs3/file.c9
-rw-r--r--fs/ntfs3/inode.c51
-rw-r--r--fs/ntfs3/ntfs_fs.h5
-rw-r--r--fs/ocfs2/aops.c12
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/dir.c3
-rw-r--r--fs/ocfs2/file.c28
-rw-r--r--fs/ocfs2/file.h1
-rw-r--r--fs/ocfs2/mmap.c6
-rw-r--r--fs/omfs/file.c4
-rw-r--r--fs/open.c51
-rw-r--r--fs/orangefs/inode.c39
-rw-r--r--fs/overlayfs/copy_up.c49
-rw-r--r--fs/overlayfs/params.c89
-rw-r--r--fs/overlayfs/super.c10
-rw-r--r--fs/pipe.c10
-rw-r--r--fs/posix_acl.c4
-rw-r--r--fs/proc/base.c118
-rw-r--r--fs/proc/consoles.c7
-rw-r--r--fs/proc/fd.c18
-rw-r--r--fs/proc/generic.c4
-rw-r--r--fs/proc/internal.h13
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/pstore/platform.c8
-rw-r--r--fs/qnx6/dir.c88
-rw-r--r--fs/qnx6/inode.c25
-rw-r--r--fs/qnx6/namei.c4
-rw-r--r--fs/qnx6/qnx6.h9
-rw-r--r--fs/read_write.c173
-rw-r--r--fs/reiserfs/inode.c57
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/select.c13
-rw-r--r--fs/signalfd.c4
-rw-r--r--fs/smb/client/Kconfig14
-rw-r--r--fs/smb/client/Makefile2
-rw-r--r--fs/smb/client/cifs_debug.c9
-rw-r--r--fs/smb/client/cifsacl.c226
-rw-r--r--fs/smb/client/cifsacl.h99
-rw-r--r--fs/smb/client/cifsencrypt.c144
-rw-r--r--fs/smb/client/cifsfs.c27
-rw-r--r--fs/smb/client/cifsfs.h1
-rw-r--r--fs/smb/client/cifsglob.h47
-rw-r--r--fs/smb/client/cifspdu.h6
-rw-r--r--fs/smb/client/cifsproto.h28
-rw-r--r--fs/smb/client/cifssmb.c68
-rw-r--r--fs/smb/client/compress.c390
-rw-r--r--fs/smb/client/compress.h90
-rw-r--r--fs/smb/client/compress/lz77.c235
-rw-r--r--fs/smb/client/compress/lz77.h15
-rw-r--r--fs/smb/client/connect.c22
-rw-r--r--fs/smb/client/file.c106
-rw-r--r--fs/smb/client/fs_context.c20
-rw-r--r--fs/smb/client/inode.c44
-rw-r--r--fs/smb/client/ioctl.c2
-rw-r--r--fs/smb/client/link.c4
-rw-r--r--fs/smb/client/misc.c20
-rw-r--r--fs/smb/client/reparse.c11
-rw-r--r--fs/smb/client/smb1ops.c2
-rw-r--r--fs/smb/client/smb2file.c6
-rw-r--r--fs/smb/client/smb2inode.c9
-rw-r--r--fs/smb/client/smb2maperror.c2
-rw-r--r--fs/smb/client/smb2misc.c2
-rw-r--r--fs/smb/client/smb2ops.c365
-rw-r--r--fs/smb/client/smb2pdu.c116
-rw-r--r--fs/smb/client/smb2pdu.h8
-rw-r--r--fs/smb/client/smb2proto.h2
-rw-r--r--fs/smb/client/smb2transport.c2
-rw-r--r--fs/smb/client/smbdirect.c96
-rw-r--r--fs/smb/client/trace.h1
-rw-r--r--fs/smb/client/transport.c6
-rw-r--r--fs/smb/client/xattr.c4
-rw-r--r--fs/smb/common/smb2pdu.h2
-rw-r--r--fs/smb/common/smb2status.h (renamed from fs/smb/client/smb2status.h)6
-rw-r--r--fs/smb/common/smbacl.h121
-rw-r--r--fs/smb/server/connection.c38
-rw-r--r--fs/smb/server/connection.h4
-rw-r--r--fs/smb/server/mgmt/share_config.c15
-rw-r--r--fs/smb/server/mgmt/share_config.h4
-rw-r--r--fs/smb/server/mgmt/tree_connect.c9
-rw-r--r--fs/smb/server/mgmt/tree_connect.h4
-rw-r--r--fs/smb/server/mgmt/user_session.c9
-rw-r--r--fs/smb/server/oplock.c59
-rw-r--r--fs/smb/server/server.c2
-rw-r--r--fs/smb/server/smb2misc.c2
-rw-r--r--fs/smb/server/smb2pdu.c46
-rw-r--r--fs/smb/server/smb_common.c11
-rw-r--r--fs/smb/server/smb_common.h6
-rw-r--r--fs/smb/server/smbacl.h111
-rw-r--r--fs/smb/server/smbstatus.h1822
-rw-r--r--fs/smb/server/transport_rdma.c2
-rw-r--r--fs/smb/server/transport_tcp.c4
-rw-r--r--fs/smb/server/vfs.c19
-rw-r--r--fs/smb/server/vfs_cache.c3
-rw-r--r--fs/smb/server/xattr.h2
-rw-r--r--fs/squashfs/file.c86
-rw-r--r--fs/squashfs/file_direct.c19
-rw-r--r--fs/squashfs/inode.c7
-rw-r--r--fs/squashfs/page_actor.c11
-rw-r--r--fs/squashfs/page_actor.h6
-rw-r--r--fs/super.c8
-rw-r--r--fs/sysv/dir.c158
-rw-r--r--fs/sysv/itree.c8
-rw-r--r--fs/sysv/namei.c32
-rw-r--r--fs/sysv/sysv.h20
-rw-r--r--fs/tracefs/event_inode.c6
-rw-r--r--fs/tracefs/inode.c12
-rw-r--r--fs/tracefs/internal.h5
-rw-r--r--fs/ubifs/dir.c64
-rw-r--r--fs/ubifs/file.c13
-rw-r--r--fs/udf/dir.c28
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c12
-rw-r--r--fs/ufs/dir.c259
-rw-r--r--fs/ufs/inode.c12
-rw-r--r--fs/ufs/namei.c39
-rw-r--r--fs/ufs/ufs.h20
-rw-r--r--fs/ufs/util.h6
-rw-r--r--fs/vboxsf/file.c24
-rw-r--r--fs/verity/signature.c18
-rw-r--r--fs/xfs/libxfs/xfs_ag.c94
-rw-r--r--fs/xfs/libxfs/xfs_ag.h14
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c103
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c24
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h207
-rw-r--r--fs/xfs/libxfs/xfs_defer.c1
-rw-r--r--fs/xfs/libxfs/xfs_fs.h31
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c14
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c40
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c7
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c270
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h61
-rw-r--r--fs/xfs/libxfs/xfs_sb.c92
-rw-r--r--fs/xfs/libxfs/xfs_sb.h3
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c4
-rw-r--r--fs/xfs/libxfs/xfs_types.h12
-rw-r--r--fs/xfs/scrub/bmap.c8
-rw-r--r--fs/xfs/scrub/bmap_repair.c2
-rw-r--r--fs/xfs/scrub/common.h29
-rw-r--r--fs/xfs/scrub/inode_repair.c12
-rw-r--r--fs/xfs/scrub/rtsummary.c11
-rw-r--r--fs/xfs/scrub/rtsummary.h2
-rw-r--r--fs/xfs/scrub/rtsummary_repair.c12
-rw-r--r--fs/xfs/scrub/scrub.h29
-rw-r--r--fs/xfs/scrub/tempfile.c2
-rw-r--r--fs/xfs/scrub/xfile.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c17
-rw-r--r--fs/xfs/xfs_bmap_util.c49
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_discard.c51
-rw-r--r--fs/xfs/xfs_exchrange.c143
-rw-r--r--fs/xfs/xfs_exchrange.h16
-rw-r--r--fs/xfs/xfs_file.c425
-rw-r--r--fs/xfs/xfs_fsmap.c433
-rw-r--r--fs/xfs/xfs_fsmap.h6
-rw-r--r--fs/xfs/xfs_fsops.c2
-rw-r--r--fs/xfs/xfs_icache.c89
-rw-r--r--fs/xfs/xfs_inode.c86
-rw-r--r--fs/xfs/xfs_inode.h12
-rw-r--r--fs/xfs/xfs_ioctl.c145
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_mru_cache.c3
-rw-r--r--fs/xfs/xfs_qm.c48
-rw-r--r--fs/xfs/xfs_qm.h3
-rw-r--r--fs/xfs/xfs_qm_syscalls.c13
-rw-r--r--fs/xfs/xfs_quotaops.c55
-rw-r--r--fs/xfs/xfs_rtalloc.c918
-rw-r--r--fs/xfs/xfs_super.c13
-rw-r--r--fs/xfs/xfs_symlink.c2
-rw-r--r--fs/xfs/xfs_trace.h61
-rw-r--r--fs/xfs/xfs_trans_ail.c7
479 files changed, 13042 insertions, 10590 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index a97ceb105cd8..819c75233235 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -68,16 +68,22 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
+ unsigned long long pos = subreq->start + subreq->transferred;
int total, err;
- total = p9_client_read(fid, subreq->start + subreq->transferred,
- &subreq->io_iter, &err);
+ total = p9_client_read(fid, pos, &subreq->io_iter, &err);
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (subreq->rreq->origin != NETFS_DIO_READ)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (pos + total >= i_size_read(rreq->inode))
+ __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
- netfs_subreq_terminated(subreq, err ?: total, false);
+ if (!err)
+ subreq->transferred += total;
+
+ netfs_read_subreq_terminated(subreq, err, false);
}
/**
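
The 9p hunk above is one instance of a netfs API rework visible throughout this diff: an ->issue_read implementation no longer passes a byte count to netfs_subreq_terminated(); it credits bytes to subreq->transferred itself, sets NETFS_SREQ_HIT_EOF when the read reaches the end of the file, and finishes with netfs_read_subreq_terminated(). A rough sketch of the resulting shape, where myfs_read_blocks() is an illustrative backend helper rather than anything from this series:

static void myfs_issue_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	unsigned long long pos = subreq->start + subreq->transferred;
	ssize_t got;
	int err = 0;

	/* Fill the iterator netfs prepared for this subrequest. */
	got = myfs_read_blocks(rreq->inode, pos, &subreq->io_iter);
	if (got < 0)
		err = got;
	else
		subreq->transferred += got;

	/* Let netfs know the read ran into the end of the file. */
	if (!err && pos + got >= i_size_read(rreq->inode))
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);

	netfs_read_subreq_terminated(subreq, err, false);
}

The error and the byte count now travel separately, which is exactly what the v9fs_issue_read() change adapts to.
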
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index a183e213a4a5..21527189e430 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -55,12 +55,11 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
static int adfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- *pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 2e612834329a..e8c2c4535cb3 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -14,8 +14,6 @@
/* Ugly macros make the code more pretty. */
-#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st))))
-#define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
#define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
#define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data)
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
index 1b973a669d23..da3217ab6adb 100644
--- a/fs/affs/amigaffs.h
+++ b/fs/affs/amigaffs.h
@@ -49,12 +49,13 @@ struct affs_short_date {
struct affs_root_head {
__be32 ptype;
+ /* The following fields are not used, but kept as documentation. */
__be32 spare1;
__be32 spare2;
__be32 hash_size;
__be32 spare3;
__be32 checksum;
- __be32 hashtable[1];
+ __be32 hashtable[];
};
struct affs_root_tail {
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index b2bf7016e1b3..bd40d5f08810 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -17,13 +17,44 @@
#include <linux/iversion.h>
#include "affs.h"
+struct affs_dir_data {
+ unsigned long ino;
+ u64 cookie;
+};
+
static int affs_readdir(struct file *, struct dir_context *);
+static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct affs_dir_data *data = file->private_data;
+
+ return generic_llseek_cookie(file, offset, whence, &data->cookie);
+}
+
+static int affs_dir_open(struct inode *inode, struct file *file)
+{
+ struct affs_dir_data *data;
+
+ data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ file->private_data = data;
+ return 0;
+}
+
+static int affs_dir_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
const struct file_operations affs_dir_operations = {
+ .open = affs_dir_open,
.read = generic_read_dir,
- .llseek = generic_file_llseek,
+ .llseek = affs_dir_llseek,
.iterate_shared = affs_readdir,
.fsync = affs_file_fsync,
+ .release = affs_dir_release,
};
/*
@@ -45,6 +76,7 @@ static int
affs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
+ struct affs_dir_data *data = file->private_data;
struct super_block *sb = inode->i_sb;
struct buffer_head *dir_bh = NULL;
struct buffer_head *fh_bh = NULL;
@@ -59,7 +91,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
if (ctx->pos < 2) {
- file->private_data = (void *)0;
+ data->ino = 0;
if (!dir_emit_dots(file, ctx))
return 0;
}
@@ -80,8 +112,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
*/
- ino = (u32)(long)file->private_data;
- if (ino && inode_eq_iversion(inode, file->f_version)) {
+ ino = data->ino;
+ if (ino && inode_eq_iversion(inode, data->cookie)) {
pr_debug("readdir() left off=%d\n", ino);
goto inside;
}
@@ -131,8 +163,8 @@ inside:
} while (ino);
}
done:
- file->f_version = inode_query_iversion(inode);
- file->private_data = (void *)(long)ino;
+ data->cookie = inode_query_iversion(inode);
+ data->ino = ino;
affs_brelse(fh_bh);
out_brelse_dir:
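
The affs directory changes follow a pattern used by several filesystems in this diff: readdir state moves out of file->f_version and the abused file->private_data pointer into a small per-open structure, and llseek goes through generic_llseek_cookie(), which takes care of invalidating the cookie when the position actually moves. For a filesystem making the same conversion, the boilerplate looks roughly like this sketch (the "myfs" names are illustrative; only generic_llseek_cookie() and the iversion helpers are taken from the kernel):

struct myfs_dir_data {
	unsigned long	pos_hint;	/* where readdir left off */
	u64		cookie;		/* replaces file->f_version */
};

static int myfs_dir_open(struct inode *inode, struct file *file)
{
	file->private_data = kzalloc(sizeof(struct myfs_dir_data), GFP_KERNEL);
	return file->private_data ? 0 : -ENOMEM;
}

static loff_t myfs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct myfs_dir_data *data = file->private_data;

	return generic_llseek_cookie(file, offset, whence, &data->cookie);
}

static int myfs_dir_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}

readdir then validates the saved position with inode_eq_iversion(inode, data->cookie) and refreshes it from inode_query_iversion(), as the affs hunks above do.
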
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 04c018e19602..a5a861dd5223 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -417,12 +417,11 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
static int affs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- *pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -433,12 +432,12 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
static int affs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned int len, unsigned int copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
/* Clear Archived bit on file writes, as AmigaOS would do */
if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
@@ -648,7 +647,7 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio)
static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct folio *folio;
@@ -671,7 +670,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
- *pagep = &folio->page;
+ *foliop = folio;
if (folio_test_uptodate(folio))
return 0;
@@ -687,9 +686,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh, *prev_bh;
@@ -882,14 +880,14 @@ affs_truncate(struct inode *inode)
if (inode->i_size > AFFS_I(inode)->mmu_private) {
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
loff_t isize = inode->i_size;
int res;
- res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata);
+ res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &folio, &fsdata);
if (!res)
- res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
+ res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, folio, fsdata);
else
inode->i_size = AFFS_I(inode)->mmu_private;
mark_inode_dirty(inode);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index c3f0c45ae9a9..492d857a3fa0 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@@ -242,8 +243,10 @@ static void afs_fetch_data_notify(struct afs_operation *op)
req->error = error;
if (subreq) {
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
+ subreq->rreq->i_size = req->file_size;
+ if (req->pos + req->actual_len >= req->file_size)
+ __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
+ netfs_read_subreq_terminated(subreq, error, false);
req->subreq = NULL;
} else if (req->done) {
req->done(req);
@@ -261,6 +264,12 @@ static void afs_fetch_data_success(struct afs_operation *op)
afs_fetch_data_notify(op);
}
+static void afs_fetch_data_aborted(struct afs_operation *op)
+{
+ afs_check_for_remote_deletion(op);
+ afs_fetch_data_notify(op);
+}
+
static void afs_fetch_data_put(struct afs_operation *op)
{
op->fetch.req->error = afs_op_error(op);
@@ -271,7 +280,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_afs_rpc = afs_fs_fetch_data,
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
- .aborted = afs_check_for_remote_deletion,
+ .aborted = afs_fetch_data_aborted,
.failed = afs_fetch_data_notify,
.put = afs_fetch_data_put,
};
@@ -293,7 +302,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
op = afs_alloc_operation(req->key, vnode->volume);
if (IS_ERR(op)) {
if (req->subreq)
- netfs_subreq_terminated(req->subreq, PTR_ERR(op), false);
+ netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false);
return PTR_ERR(op);
}
@@ -304,14 +313,15 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
return afs_do_sync_operation(op);
}
-static void afs_issue_read(struct netfs_io_subrequest *subreq)
+static void afs_read_worker(struct work_struct *work)
{
+ struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct afs_read *fsreq;
fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
- return netfs_subreq_terminated(subreq, -ENOMEM, false);
+ return netfs_read_subreq_terminated(subreq, -ENOMEM, false);
fsreq->subreq = subreq;
fsreq->pos = subreq->start + subreq->transferred;
@@ -320,10 +330,17 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->vnode = vnode;
fsreq->iter = &subreq->io_iter;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
}
+static void afs_issue_read(struct netfs_io_subrequest *subreq)
+{
+ INIT_WORK(&subreq->work, afs_read_worker);
+ queue_work(system_long_wq, &subreq->work);
+}
+
static int afs_symlink_read_folio(struct file *file, struct folio *folio)
{
struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 79cd30775b7a..098fa034a1cc 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -304,6 +304,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
+ size_t count_before;
int ret;
_enter("{%u,%zu,%zu/%llu}",
@@ -345,10 +346,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
/* extract the returned data */
case 2:
- _debug("extract data %zu/%llu",
- iov_iter_count(call->iter), req->actual_len);
+ count_before = call->iov_len;
+ _debug("extract data %zu/%llu", count_before, req->actual_len);
ret = afs_extract_data(call, true);
+ if (req->subreq) {
+ req->subreq->transferred += count_before - call->iov_len;
+ netfs_read_subreq_progress(req->subreq, false);
+ }
if (ret < 0)
return ret;
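
Both the AFS and YFS fetch-data delivery routines (this hunk and the matching one in yfsclient.c below) gain the same bookkeeping: snapshot how many bytes were still expected, extract, then credit the difference to the subrequest and call netfs_read_subreq_progress() before checking for errors, so the netfs core sees partial progress even if the call later aborts. Folded into a helper purely for illustration (not something this diff adds), the pattern is:

static int afs_extract_with_progress(struct afs_call *call, struct afs_read *req)
{
	size_t count_before = call->iov_len;	/* bytes still expected */
	int ret = afs_extract_data(call, true);

	if (req->subreq) {
		/* Credit whatever this pass consumed, even on error or short data. */
		req->subreq->transferred += count_before - call->iov_len;
		netfs_read_subreq_progress(req->subreq, false);
	}
	return ret;
}
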
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 3acf5e050072..a95e77670b49 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -695,13 +695,18 @@ static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
struct afs_vnode *vnode = vp->vnode;
+ struct inode *inode = &vnode->netfs.inode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
- loff_t i_size = op->setattr.old_i_size;
+ loff_t old = op->setattr.old_i_size;
+
+ /* Note: inode->i_size was updated by afs_apply_status() inside
+ * the I/O and callback locks.
+ */
- if (size != i_size) {
- truncate_setsize(&vnode->netfs.inode, size);
+ if (size != old) {
+ truncate_pagecache(inode, size);
netfs_resize_file(&vnode->netfs, size, true);
fscache_resize_cookie(afs_vnode_cache(vnode), size);
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index e959640694c2..34107b55f834 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -89,10 +89,12 @@ static const struct afs_operation_ops afs_store_data_operation = {
*/
void afs_prepare_write(struct netfs_io_subrequest *subreq)
{
+ struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr];
+
//if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
// subreq->max_len = 512 * 1024;
//else
- subreq->max_len = 256 * 1024 * 1024;
+ stream->sreq_max_len = 256 * 1024 * 1024;
}
/*
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index f521e66d3bf6..024227aba4cd 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -355,6 +355,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
+ size_t count_before;
int ret;
_enter("{%u,%zu, %zu/%llu}",
@@ -391,10 +392,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
/* extract the returned data */
case 2:
- _debug("extract data %zu/%llu",
- iov_iter_count(call->iter), req->actual_len);
+ count_before = call->iov_len;
+ _debug("extract data %zu/%llu", count_before, req->actual_len);
ret = afs_extract_data(call, true);
+ if (req->subreq) {
+ req->subreq->transferred += count_before - call->iov_len;
+ netfs_read_subreq_progress(req->subreq, false);
+ }
if (ret < 0)
return ret;
diff --git a/fs/aio.c b/fs/aio.c
index 6066f64967b3..e8920178b50f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -100,7 +100,7 @@ struct kioctx {
unsigned long user_id;
- struct __percpu kioctx_cpu *cpu;
+ struct kioctx_cpu __percpu *cpu;
/*
* For percpu reqs_available, number of slots we move to/from global
diff --git a/fs/attr.c b/fs/attr.c
index 825007d5cda4..c04d19b58f12 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -487,9 +487,17 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
error = security_inode_setattr(idmap, dentry, attr);
if (error)
return error;
- error = try_break_deleg(inode, delegated_inode);
- if (error)
- return error;
+
+ /*
+ * If ATTR_DELEG is set, then these attributes are being set on
+ * behalf of the holder of a write delegation. We want to avoid
+ * breaking the delegation in this case.
+ */
+ if (!(ia_valid & ATTR_DELEG)) {
+ error = try_break_deleg(inode, delegated_inode);
+ if (error)
+ return error;
+ }
if (inode->i_op->setattr)
error = inode->i_op->setattr(idmap, dentry, attr);
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 8c1d587b3eef..77c7991d89aa 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -62,6 +62,7 @@ struct autofs_info {
struct list_head expiring;
struct autofs_sb_info *sbi;
+ unsigned long exp_timeout;
unsigned long last_used;
int count;
@@ -81,6 +82,9 @@ struct autofs_info {
*/
#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
+#define AUTOFS_INF_EXPIRE_SET (1<<3) /* per-dentry expire timeout set for
+ this mount point.
+ */
struct autofs_wait_queue {
wait_queue_head_t queue;
struct autofs_wait_queue *next;
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 5bf781ea6d67..f011e026358e 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -128,7 +128,13 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
goto out;
}
+ /* Setting the per-dentry expire timeout requires a trailing
+ * path component, ie. no '/', so invert the logic of the
+ * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD.
+ */
err = check_name(param->path);
+ if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD)
+ err = err ? 0 : -EINVAL;
if (err) {
pr_warn("invalid path supplied for cmd(0x%08x)\n",
cmd);
@@ -396,16 +402,97 @@ static int autofs_dev_ioctl_catatonic(struct file *fp,
return 0;
}
-/* Set the autofs mount timeout */
+/*
+ * Set the autofs mount expire timeout.
+ *
+ * There are two places an expire timeout can be set, in the autofs
+ * super block info. (this is all that's needed for direct and offset
+ * mounts because there's a distinct mount corresponding to each of
+ * these) and per-dentry within the dentry info. If a per-dentry
+ * timeout is set it will override the expire timeout set in the parent
+ * autofs super block info.
+ *
+ * If setting the autofs super block expire timeout the autofs_dev_ioctl
+ * size field will be equal to the autofs_dev_ioctl structure size. If
+ * setting the per-dentry expire timeout the mount point name is passed
+ * in the autofs_dev_ioctl path field and the size field updated to
+ * reflect this.
+ *
+ * Setting the autofs mount expire timeout sets the timeout in the super
+ * block info. struct. Setting the per-dentry timeout does a little more.
+ * If the timeout is equal to -1 the per-dentry timeout (and flag) is
+ * cleared which reverts to using the super block timeout, otherwise if
+ * timeout is 0 the timeout is set to this value and the flag is left
+ * set which disables expiration for the mount point, lastly the flag
+ * and the timeout are set enabling the dentry to use this timeout.
+ */
static int autofs_dev_ioctl_timeout(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- unsigned long timeout;
+ unsigned long timeout = param->timeout.timeout;
+
+ /* If setting the expire timeout for an individual indirect
+ * mount point dentry the mount trailing component path is
+ * placed in param->path and param->size adjusted to account
+ * for it; otherwise param->size is set to the structure
+ * size.
+ */
+ if (param->size == AUTOFS_DEV_IOCTL_SIZE) {
+ param->timeout.timeout = sbi->exp_timeout / HZ;
+ sbi->exp_timeout = timeout * HZ;
+ } else {
+ struct dentry *base = fp->f_path.dentry;
+ struct inode *inode = base->d_inode;
+ int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1;
+ struct dentry *dentry;
+ struct autofs_info *ino;
+
+ if (!autofs_type_indirect(sbi->type))
+ return -EINVAL;
+
+ /* An expire timeout greater than the superblock timeout
+ * could be a problem at shutdown but the super block
+ * timeout itself can change so all we can really do is
+ * warn the user.
+ */
+ if (timeout >= sbi->exp_timeout)
+ pr_warn("per-mount expire timeout is greater than "
+ "the parent autofs mount timeout which could "
+ "prevent shutdown\n");
+
+ inode_lock_shared(inode);
+ dentry = try_lookup_one_len(param->path, base, path_len);
+ inode_unlock_shared(inode);
+ if (IS_ERR_OR_NULL(dentry))
+ return dentry ? PTR_ERR(dentry) : -ENOENT;
+ ino = autofs_dentry_ino(dentry);
+ if (!ino) {
+ dput(dentry);
+ return -ENOENT;
+ }
+
+ if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET)
+ param->timeout.timeout = ino->exp_timeout / HZ;
+ else
+ param->timeout.timeout = sbi->exp_timeout / HZ;
+
+ if (timeout == -1) {
+ /* Revert to using the super block timeout */
+ ino->flags &= ~AUTOFS_INF_EXPIRE_SET;
+ ino->exp_timeout = 0;
+ } else {
+ /* Set the dentry expire flag and timeout.
+ *
+ * If timeout is 0 it will prevent the expire
+ * of this particular automount.
+ */
+ ino->flags |= AUTOFS_INF_EXPIRE_SET;
+ ino->exp_timeout = timeout * HZ;
+ }
+ dput(dentry);
+ }
- timeout = param->timeout.timeout;
- param->timeout.timeout = sbi->exp_timeout / HZ;
- sbi->exp_timeout = timeout * HZ;
return 0;
}
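
From user space, the difference between the two cases described in the comment above is simply whether a trailing path component is appended to the ioctl parameter block and param->size is enlarged to cover it. A hedged sketch of setting a per-dentry timeout; the structure, macros and init helper come from <linux/auto_dev-ioctl.h>, while obtaining the ioctlfd (normally via AUTOFS_DEV_IOCTL_OPENMOUNT) is assumed rather than shown:

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

/* name: trailing component of the automount; secs == -1 reverts to the
 * superblock timeout, secs == 0 disables expiry for this mount point. */
static int set_dentry_timeout(int ioctlfd, const char *name, unsigned long secs)
{
	size_t sz = AUTOFS_DEV_IOCTL_SIZE + strlen(name) + 1;
	struct autofs_dev_ioctl *param = calloc(1, sz);
	int devfd, ret = -1;

	if (!param)
		return -1;

	init_autofs_dev_ioctl(param);
	param->size = sz;			/* size > struct size => per-dentry case */
	param->ioctlfd = ioctlfd;		/* fd on the indirect autofs mount */
	param->timeout.timeout = secs;
	strcpy(param->path, name);		/* no '/', as validate_dev_ioctl() now insists */

	devfd = open("/dev/autofs", O_RDONLY);
	if (devfd >= 0) {
		ret = ioctl(devfd, AUTOFS_DEV_IOCTL_TIMEOUT, param);
		close(devfd);
	}
	free(param);
	return ret;
}
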
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 39d8c84c16f4..5c2d459e1e48 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -429,8 +429,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
if (!root)
return NULL;
- timeout = sbi->exp_timeout;
-
dentry = NULL;
while ((dentry = get_next_positive_subdir(dentry, root))) {
spin_lock(&sbi->fs_lock);
@@ -441,6 +439,11 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
}
spin_unlock(&sbi->fs_lock);
+ if (ino->flags & AUTOFS_INF_EXPIRE_SET)
+ timeout = ino->exp_timeout;
+ else
+ timeout = sbi->exp_timeout;
+
expired = should_expire(dentry, mnt, timeout, how);
if (!expired)
continue;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index cf792d4de4f1..ee2edccaef70 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -19,6 +19,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
INIT_LIST_HEAD(&ino->expiring);
ino->last_used = jiffies;
ino->sbi = sbi;
+ ino->exp_timeout = -1;
ino->count = 1;
}
return ino;
@@ -28,6 +29,7 @@ void autofs_clean_ino(struct autofs_info *ino)
{
ino->uid = GLOBAL_ROOT_UID;
ino->gid = GLOBAL_ROOT_GID;
+ ino->exp_timeout = -1;
ino->last_used = jiffies;
}
@@ -172,8 +174,7 @@ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
ret = autofs_check_pipe(pipe);
if (ret < 0) {
errorf(fc, "Invalid/unusable pipe");
- if (param->type != fs_value_is_file)
- fput(pipe);
+ fput(pipe);
return -EBADF;
}
diff --git a/fs/backing-file.c b/fs/backing-file.c
index afb557446c27..8860dac58c37 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -303,13 +303,16 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
return -EIO;
+ if (!out->f_op->splice_write)
+ return -EINVAL;
+
ret = file_remove_privs(ctx->user_file);
if (ret)
return ret;
old_cred = override_creds(ctx->cred);
file_start_write(out);
- ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+ ret = out->f_op->splice_write(pipe, out, ppos, len, flags);
file_end_write(out);
revert_creds(old_cred);
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index a7b425d3c8a0..331a17f3f113 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -272,16 +272,19 @@ bch2_acl_to_xattr(struct btree_trans *trans,
return xattr;
}
-struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
- struct dentry *dentry, int type)
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
{
- struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+ struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct posix_acl *acl = NULL;
+
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
index 27e7eec0f278..fe730a6bf0c1 100644
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
@@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t);
#ifdef CONFIG_BCACHEFS_POSIX_ACL
-struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+struct posix_acl *bch2_get_acl(struct inode *, int, bool);
int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index d9c5a92fa708..dc3a4024aab6 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -196,121 +196,119 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
int ret = 0;
/* allow for unknown fields */
- bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
- alloc_v1_val_size_bad,
+ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
+ c, alloc_v1_val_size_bad,
"incorrect value size (%zu < %u)",
bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
fsck_err:
return ret;
}
-int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_alloc_unpacked u;
int ret = 0;
- bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
- alloc_v2_unpack_error,
+ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
+ c, alloc_v2_unpack_error,
"unpack error");
fsck_err:
return ret;
}
-int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_alloc_unpacked u;
int ret = 0;
- bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
- alloc_v2_unpack_error,
+ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
+ c, alloc_v2_unpack_error,
"unpack error");
fsck_err:
return ret;
}
-int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err)
+int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
- struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+ struct bch_alloc_v4 a;
int ret = 0;
- bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
- alloc_v4_val_size_bad,
+ bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
+
+ bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
+ c, alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
- alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));
+ alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
- bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
- alloc_v4_backpointers_start_bad,
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
+ c, alloc_v4_backpointers_start_bad,
"invalid backpointers_start");
- bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
- alloc_key_data_type_bad,
+ bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
+ c, alloc_key_data_type_bad,
"invalid data type (got %u should be %u)",
- a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
+ a.data_type, alloc_data_type(a, a.data_type));
for (unsigned i = 0; i < 2; i++)
- bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX,
- c, err,
- alloc_key_io_time_bad,
+ bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
+ c, alloc_key_io_time_bad,
"invalid io_time[%s]: %llu, max %llu",
i == READ ? "read" : "write",
- a.v->io_time[i], LRU_TIME_MAX);
+ a.io_time[i], LRU_TIME_MAX);
- unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
+ unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
offsetof(struct bch_alloc_v4, stripe_sectors)
- ? a.v->stripe_sectors
+ ? a.stripe_sectors
: 0;
- switch (a.v->data_type) {
+ switch (a.data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
bkey_fsck_err_on(stripe_sectors ||
- a.v->dirty_sectors ||
- a.v->cached_sectors ||
- a.v->stripe,
- c, err, alloc_key_empty_but_have_data,
+ a.dirty_sectors ||
+ a.cached_sectors ||
+ a.stripe,
+ c, alloc_key_empty_but_have_data,
"empty data type free but have data %u.%u.%u %u",
stripe_sectors,
- a.v->dirty_sectors,
- a.v->cached_sectors,
- a.v->stripe);
+ a.dirty_sectors,
+ a.cached_sectors,
+ a.stripe);
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- bkey_fsck_err_on(!a.v->dirty_sectors &&
+ bkey_fsck_err_on(!a.dirty_sectors &&
!stripe_sectors,
- c, err, alloc_key_dirty_sectors_0,
+ c, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_type_str(a.v->data_type));
+ bch2_data_type_str(a.data_type));
break;
case BCH_DATA_cached:
- bkey_fsck_err_on(!a.v->cached_sectors ||
- a.v->dirty_sectors ||
+ bkey_fsck_err_on(!a.cached_sectors ||
+ a.dirty_sectors ||
stripe_sectors ||
- a.v->stripe,
- c, err, alloc_key_cached_inconsistency,
+ a.stripe,
+ c, alloc_key_cached_inconsistency,
"data type inconsistency");
- bkey_fsck_err_on(!a.v->io_time[READ] &&
+ bkey_fsck_err_on(!a.io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
- c, err, alloc_key_cached_but_read_time_zero,
+ c, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
case BCH_DATA_stripe:
@@ -513,14 +511,13 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
: 0;
}
-int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
int ret = 0;
- bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
- bucket_gens_val_size_bad,
+ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
+ c, bucket_gens_val_size_bad,
"bad val size (%zu != %zu)",
bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
fsck_err:
@@ -561,7 +558,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
struct bpos pos = alloc_gens_pos(iter.pos, &offset);
int ret2 = 0;
- if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
+ if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret2)
@@ -829,7 +826,19 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
- struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
+
+ struct bch_alloc_v4 *new_a;
+ if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
+ new_a = bkey_s_to_alloc_v4(new).v;
+ } else {
+ BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
+
+ struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
+ ret = PTR_ERR_OR_ZERO(new_ka);
+ if (unlikely(ret))
+ goto err;
+ new_a = &new_ka->v;
+ }
if (flags & BTREE_TRIGGER_transactional) {
alloc_data_type_set(new_a, new_a->data_type);
@@ -1865,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work)
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_dev_do_discards(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
- goto put_ioref;
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_write_ref;
if (queue_work(c->write_ref_wq, &ca->discard_work))
return;
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
-put_ioref:
percpu_ref_put(&ca->io_ref);
+put_write_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@@ -1959,8 +1968,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
break;
}
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
@@ -1970,18 +1979,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
if (discard_in_flight_add(ca, bucket, false))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
- goto put_ioref;
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
return;
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
-put_ioref:
percpu_ref_put(&ca->io_ref);
+put_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@@ -2123,26 +2132,26 @@ static void bch2_do_invalidates_work(struct work_struct *work)
bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
percpu_ref_put(&ca->io_ref);
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_dev_do_invalidates(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
- goto put_ioref;
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ goto put_ref;
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
return;
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
-put_ioref:
percpu_ref_put(&ca->io_ref);
+put_ref:
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_do_invalidates(struct bch_fs *c)
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 8d2b62c9588e..fd790b03fbe1 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -82,6 +82,14 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
bucket_data_type(bucket) != bucket_data_type(ptr);
}
+/*
+ * It is my general preference to use unsigned types for unsigned quantities -
+ * however, these helpers are used in disk accounting calculations run by
+ * triggers where the output will be negated and added to an s64. unsigned is
+ * right out even though all these quantities will fit in 32 bits, since it
+ * won't be sign extended correctly; u64 will negate "correctly", but s64 is the
+ * simpler option here.
+ */
static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
@@ -142,7 +150,9 @@ static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_typ
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{
- return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
+ return a.data_type == BCH_DATA_cached
+ ? a.io_time[READ] & LRU_TIME_MAX
+ : 0;
}
#define DATA_TYPES_MOVABLE \
@@ -166,8 +176,8 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
* avoid overflowing LRU_TIME_BITS on a corrupted fs, when
* bucket_sectors_dirty is (much) bigger than bucket_size
*/
- u64 d = min(bch2_bucket_sectors_dirty(a),
- ca->mi.bucket_size);
+ u64 d = min_t(s64, bch2_bucket_sectors_dirty(a),
+ ca->mi.bucket_size);
return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
@@ -232,52 +242,48 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v1_invalid, \
+ .key_validate = bch2_alloc_v1_validate, \
.val_to_text = bch2_alloc_to_text, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v2_invalid, \
+ .key_validate = bch2_alloc_v2_validate, \
.val_to_text = bch2_alloc_to_text, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v3_invalid, \
+ .key_validate = bch2_alloc_v3_validate, \
.val_to_text = bch2_alloc_to_text, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 16, \
})
#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
- .key_invalid = bch2_alloc_v4_invalid, \
+ .key_validate = bch2_alloc_v4_validate, \
.val_to_text = bch2_alloc_to_text, \
.swab = bch2_alloc_v4_swab, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
})
-int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \
- .key_invalid = bch2_bucket_gens_invalid, \
+ .key_validate = bch2_bucket_gens_validate, \
.val_to_text = bch2_bucket_gens_to_text, \
})
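
The comment added to the bucket-sector helpers above is easy to verify in isolation: when an unsigned count is negated and accumulated into an s64, the usual arithmetic conversions yield a large positive value for a 32-bit type, a u64 only "works" by wrapping, and a signed type is simply correct. A stand-alone illustration (plain C, nothing bcachefs-specific):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t u32_sectors = 5;
	uint64_t u64_sectors = 5;
	int64_t  s64_sectors = 5;
	int64_t  acct;

	acct = 0;
	acct += -u32_sectors;	/* -5 becomes 4294967291: converted, not sign-extended */
	printf("u32: %lld\n", (long long)acct);

	acct = 0;
	acct += -u64_sectors;	/* 2^64 - 5, which wraps back to -5 when stored in an s64 */
	printf("u64: %lld\n", (long long)acct);

	acct = 0;
	acct += -s64_sectors;	/* plainly -5: the "simpler option" the comment settles on */
	printf("s64: %lld\n", (long long)acct);
	return 0;
}
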
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
index 47d9d006502c..f754a2951d8a 100644
--- a/fs/bcachefs/alloc_background_format.h
+++ b/fs/bcachefs/alloc_background_format.h
@@ -69,6 +69,7 @@ struct bch_alloc_v4 {
__u64 io_time[2];
__u32 stripe;
__u32 nr_external_backpointers;
+ /* end of fields in original version of alloc_v4 */
__u64 fragmentation_lru;
__u32 stripe_sectors;
__u32 pad;
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 618d2ff0292e..8563c2d26847 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1603,7 +1603,8 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope
prt_newline(out);
}
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca)
{
struct open_bucket *ob;
@@ -1613,7 +1614,8 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list)
+ if (ob->valid && !ob->on_partial_list &&
+ (!ca || ob->dev == ca->dev_idx))
bch2_open_bucket_to_text(out, c, ob);
spin_unlock(&ob->lock);
}
@@ -1738,7 +1740,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
- bch2_dev_usage_to_text(out, &stats);
+ bch2_dev_usage_to_text(out, ca, &stats);
prt_newline(out);
@@ -1756,11 +1758,12 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
}
-void bch2_print_allocator_stuck(struct bch_fs *c)
+static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
{
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n");
+ prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
+ c->opts.allocator_stuck_timeout);
prt_printf(&buf, "Allocator debug:\n");
printbuf_indent_add(&buf, 2);
@@ -1790,3 +1793,24 @@ void bch2_print_allocator_stuck(struct bch_fs *c)
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
+
+static inline unsigned allocator_wait_timeout(struct bch_fs *c)
+{
+ if (c->allocator_last_stuck &&
+ time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
+ return 0;
+
+ return c->opts.allocator_stuck_timeout * HZ;
+}
+
+void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ unsigned t = allocator_wait_timeout(c);
+
+ if (t && closure_sync_timeout(cl, t)) {
+ c->allocator_last_stuck = jiffies;
+ bch2_print_allocator_stuck(c);
+ }
+
+ closure_sync(cl);
+}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 6da9e7e29026..386d231ceca3 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -223,7 +223,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
void bch2_fs_allocator_foreground_init(struct bch_fs *);
void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *);
void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
@@ -231,6 +231,11 @@ void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *);
void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *);
-void bch2_print_allocator_stuck(struct bch_fs *);
+void __bch2_wait_on_allocator(struct bch_fs *, struct closure *);
+static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
+{
+ if (cl->closure_get_happened)
+ __bch2_wait_on_allocator(c, cl);
+}
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
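
Usage sketch: call sites that previously blocked indefinitely in closure_sync() now go through the wrapper, which only waits (and only prints the "allocator stuck" report after the configurable timeout) when the closure was actually used. This mirrors the btree_update_interior.c hunk later in this diff:

	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);

	bch2_trans_unlock(trans);
	bch2_wait_on_allocator(c, &cl);		/* was: closure_sync(&cl) */
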
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 3cc02479a982..d4da6343efa9 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -47,9 +47,8 @@ static bool extent_matches_bp(struct bch_fs *c,
return false;
}
-int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
@@ -68,8 +67,7 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
!bpos_eq(bp.k->p, bp_pos),
- c, err,
- backpointer_bucket_offset_wrong,
+ c, backpointer_bucket_offset_wrong,
"backpointer bucket_offset wrong");
fsck_err:
return ret;
@@ -763,27 +761,22 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
btree < BTREE_ID_NR && !ret;
btree++) {
unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
- struct btree_iter iter;
- struct btree *b;
if (!(BIT_ULL(btree) & btree_leaf_mask) &&
!(BIT_ULL(btree) & btree_interior_mask))
continue;
- bch2_trans_begin(trans);
-
- __for_each_btree_node(trans, iter, btree,
+ ret = __for_each_btree_node(trans, iter, btree,
btree == start.btree ? start.pos : POS_MIN,
- 0, depth, BTREE_ITER_prefetch, b, ret) {
+ 0, depth, BTREE_ITER_prefetch, b, ({
mem_may_pin -= btree_buf_bytes(b);
if (mem_may_pin <= 0) {
c->btree_cache.pinned_nodes_end = *end =
BBPOS(btree, b->key.k.p);
- bch2_trans_iter_exit(trans, &iter);
- return 0;
+ break;
}
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
}
return ret;
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 6021de1c5e98..7daecadb764e 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -18,14 +18,13 @@ static inline u64 swab40(u64 x)
((x & 0xff00000000ULL) >> 32));
}
-int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
- enum bch_validate_flags, struct printbuf *);
+int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags);
void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_backpointer_swab(struct bkey_s);
#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
- .key_invalid = bch2_backpointer_invalid, \
+ .key_validate = bch2_backpointer_validate, \
.val_to_text = bch2_backpointer_k_to_text, \
.swab = bch2_backpointer_swab, \
.min_val_size = 32, \
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 91361a167dcd..0c7086e00d18 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -447,6 +447,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(blocked_journal_low_on_space) \
x(blocked_journal_low_on_pin) \
x(blocked_journal_max_in_flight) \
+ x(blocked_key_cache_flush) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(blocked_write_buffer_full) \
@@ -893,6 +894,8 @@ struct bch_fs {
struct bch_fs_usage_base __percpu *usage;
u64 __percpu *online_reserved;
+ unsigned long allocator_last_stuck;
+
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 74a60b1a4ddf..14ce726bf5a3 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -675,7 +675,10 @@ struct bch_sb_field_ext {
x(btree_subvolume_children, BCH_VERSION(1, 6)) \
x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
- x(disk_accounting_v2, BCH_VERSION(1, 9))
+ x(disk_accounting_v2, BCH_VERSION(1, 9)) \
+ x(disk_accounting_v3, BCH_VERSION(1, 10)) \
+ x(disk_accounting_inum, BCH_VERSION(1, 11)) \
+ x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -836,6 +839,8 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
struct bch_sb, flags[5], 0, 16);
+LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
+ struct bch_sb, flags[5], 16, 32);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
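
BCH_SB_ALLOCATOR_STUCK_TIMEOUT occupies bits 16-31 of flags[5], i.e. a 16-bit field holding a timeout of 0-65535 seconds. As with the other LE64_BITMASK() fields, the macro generates a getter/setter pair; a hedged sketch of persisting the option (the exact call site is assumed, only the accessor names follow the existing convention):

	SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, c->opts.allocator_stuck_timeout);
	unsigned timeout = BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb);
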
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 936357149cf0..e34cb2bf329c 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -10,9 +10,10 @@
#include "vstructs.h"
enum bch_validate_flags {
- BCH_VALIDATE_write = (1U << 0),
- BCH_VALIDATE_commit = (1U << 1),
- BCH_VALIDATE_journal = (1U << 2),
+ BCH_VALIDATE_write = BIT(0),
+ BCH_VALIDATE_commit = BIT(1),
+ BCH_VALIDATE_journal = BIT(2),
+ BCH_VALIDATE_silent = BIT(3),
};
#if 0
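
The new BCH_VALIDATE_silent flag lets callers that only want a yes/no answer suppress fsck error reporting; bkey_packed_valid() in the btree_io.c hunk below is the first user. Sketch (variable names taken from that hunk):

	/* validity check with no error message or fsck accounting: */
	bool ok = !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent);
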
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5f07cf853d0c..88d8958281e8 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -27,27 +27,27 @@ const char * const bch2_bkey_types[] = {
NULL
};
-static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err)
+static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
return 0;
}
#define bch2_bkey_ops_deleted ((struct bkey_ops) { \
- .key_invalid = deleted_key_invalid, \
+ .key_validate = deleted_key_validate, \
})
#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \
- .key_invalid = deleted_key_invalid, \
+ .key_validate = deleted_key_validate, \
})
-static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err)
+static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
int ret = 0;
- bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
- bkey_val_size_nonzero,
+ bkey_fsck_err_on(bkey_val_bytes(k.k),
+ c, bkey_val_size_nonzero,
"incorrect value size (%zu != 0)",
bkey_val_bytes(k.k));
fsck_err:
@@ -55,11 +55,11 @@ fsck_err:
}
#define bch2_bkey_ops_error ((struct bkey_ops) { \
- .key_invalid = empty_val_key_invalid, \
+ .key_validate = empty_val_key_validate, \
})
-static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err)
+static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
return 0;
}
@@ -73,17 +73,17 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
}
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
- .key_invalid = key_type_cookie_invalid, \
+ .key_validate = key_type_cookie_validate, \
.val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\
- .key_invalid = empty_val_key_invalid, \
+ .key_validate = empty_val_key_validate, \
})
-static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err)
+static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
return 0;
}
@@ -98,9 +98,9 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
datalen, min(datalen, 32U), d.v->data);
}
-#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
- .key_invalid = key_type_inline_data_invalid, \
- .val_to_text = key_type_inline_data_to_text, \
+#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \
+ .key_validate = key_type_inline_data_validate, \
+ .val_to_text = key_type_inline_data_to_text, \
})
static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
@@ -110,7 +110,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
}
#define bch2_bkey_ops_set ((struct bkey_ops) { \
- .key_invalid = empty_val_key_invalid, \
+ .key_validate = empty_val_key_validate, \
.key_merge = key_type_set_merge, \
})
@@ -123,9 +123,8 @@ const struct bkey_ops bch2_bkey_ops[] = {
const struct bkey_ops bch2_bkey_null_ops = {
};
-int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
return 0;
@@ -133,15 +132,15 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
int ret = 0;
- bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
- bkey_val_size_too_small,
+ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size,
+ c, bkey_val_size_too_small,
"bad val size (%zu < %u)",
bkey_val_bytes(k.k), ops->min_val_size);
- if (!ops->key_invalid)
+ if (!ops->key_validate)
return 0;
- ret = ops->key_invalid(c, k, flags, err);
+ ret = ops->key_validate(c, k, flags);
fsck_err:
return ret;
}
@@ -161,18 +160,17 @@ const char *bch2_btree_node_type_str(enum btree_node_type type)
return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
}
-int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum btree_node_type type,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum btree_node_type type,
+ enum bch_validate_flags flags)
{
if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
return 0;
int ret = 0;
- bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
- bkey_u64s_too_small,
+ bkey_fsck_err_on(k.k->u64s < BKEY_U64s,
+ c, bkey_u64s_too_small,
"u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
if (type >= BKEY_TYPE_NR)
@@ -180,8 +178,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
(type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) &&
- !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
- bkey_invalid_type_for_btree,
+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
+ c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
bch2_btree_node_type_str(type),
k.k->type < KEY_TYPE_MAX
@@ -189,17 +187,17 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
: "(unknown)");
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- bkey_fsck_err_on(k.k->size == 0, c, err,
- bkey_extent_size_zero,
+ bkey_fsck_err_on(k.k->size == 0,
+ c, bkey_extent_size_zero,
"size == 0");
- bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
- bkey_extent_size_greater_than_offset,
+ bkey_fsck_err_on(k.k->size > k.k->p.offset,
+ c, bkey_extent_size_greater_than_offset,
"size greater than offset (%u > %llu)",
k.k->size, k.k->p.offset);
} else {
- bkey_fsck_err_on(k.k->size, c, err,
- bkey_size_nonzero,
+ bkey_fsck_err_on(k.k->size,
+ c, bkey_size_nonzero,
"size != 0");
}
@@ -207,12 +205,12 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_id btree = type - 1;
if (btree_type_has_snapshots(btree)) {
- bkey_fsck_err_on(!k.k->p.snapshot, c, err,
- bkey_snapshot_zero,
+ bkey_fsck_err_on(!k.k->p.snapshot,
+ c, bkey_snapshot_zero,
"snapshot == 0");
} else if (!btree_type_has_snapshot_field(btree)) {
- bkey_fsck_err_on(k.k->p.snapshot, c, err,
- bkey_snapshot_nonzero,
+ bkey_fsck_err_on(k.k->p.snapshot,
+ c, bkey_snapshot_nonzero,
"nonzero snapshot");
} else {
/*
@@ -221,34 +219,33 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
*/
}
- bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
- bkey_at_pos_max,
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX),
+ c, bkey_at_pos_max,
"key at POS_MAX");
}
fsck_err:
return ret;
}
-int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
- enum bch_validate_flags flags,
- struct printbuf *err)
+ enum bch_validate_flags flags)
{
- return __bch2_bkey_invalid(c, k, type, flags, err) ?:
- bch2_bkey_val_invalid(c, k, flags, err);
+ return __bch2_bkey_validate(c, k, type, flags) ?:
+ bch2_bkey_val_validate(c, k, flags);
}
int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k, struct printbuf *err)
+ struct bkey_s_c k, enum bch_validate_flags flags)
{
int ret = 0;
- bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
- bkey_before_start_of_btree_node,
+ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key),
+ c, bkey_before_start_of_btree_node,
"key before start of btree node");
- bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
- bkey_after_end_of_btree_node,
+ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key),
+ c, bkey_after_end_of_btree_node,
"key past end of btree node");
fsck_err:
return ret;
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index baef0722f5fb..3df3dd2723a1 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -14,15 +14,15 @@ extern const char * const bch2_bkey_types[];
extern const struct bkey_ops bch2_bkey_null_ops;
/*
- * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
+ * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If
* invalid, entire key will be deleted.
*
* When invalid, error string is returned via @err. @rw indicates whether key is
* being read or written; more aggressive checks can be enabled when rw == WRITE.
*/
struct bkey_ops {
- int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err);
+ int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
@@ -48,14 +48,13 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
: &bch2_bkey_null_ops;
}
-int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bch_validate_flags, struct printbuf *);
-int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
- enum bch_validate_flags, struct printbuf *);
-int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
- struct bkey_s_c, struct printbuf *);
+int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+ enum bch_validate_flags);
+int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
+ enum bch_validate_flags);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index f5d85b50b6f2..e52a06d3418c 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return b;
}
+void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
+{
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@@ -736,6 +746,13 @@ out:
start_time);
memalloc_nofs_restore(flags);
+
+ int ret = bch2_trans_relock(trans);
+ if (unlikely(ret)) {
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
+ }
+
return b;
err:
mutex_lock(&bc->lock);
@@ -856,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
bch2_btree_node_read(trans, b, sync);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+
if (!sync)
return NULL;
@@ -974,6 +995,10 @@ retry:
bch2_btree_node_wait_on_read(b);
+ ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+
/*
* should_be_locked is not set on this path yet, so we need to
* relock it specifically:
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index c0eb87a057cc..f82064007127 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -12,6 +12,8 @@ struct btree_iter;
void bch2_recalc_btree_reserve(struct bch_fs *);
+void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
+
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 6cbf2aa6a947..eb3002c4eae7 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -741,12 +741,9 @@ fsck_err:
static int bch2_mark_superblocks(struct bch_fs *c)
{
- mutex_lock(&c->sb_lock);
gc_pos_set(c, gc_phase(GC_PHASE_sb));
- int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
- mutex_unlock(&c->sb_lock);
- return ret;
+ return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
}
static void bch2_gc_free(struct bch_fs *c)
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 2c424435ca4a..56ea9a77cd4a 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -836,14 +836,13 @@ fsck_err:
return ret;
}
-static int bset_key_invalid(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k,
- bool updated_range, int rw,
- struct printbuf *err)
+static int bset_key_validate(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k,
+ bool updated_range, int rw)
{
- return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
- (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
+ return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?:
+ (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0);
}
static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
@@ -858,12 +857,9 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
if (!bkeyp_u64s_valid(&b->format, k))
return false;
- struct printbuf buf = PRINTBUF;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
- printbuf_exit(&buf);
- return ret;
+ return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent);
}
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
@@ -915,19 +911,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
u = __bkey_disassemble(b, k, &tmp);
- printbuf_reset(&buf);
- if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
- printbuf_reset(&buf);
- bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, u.s_c);
-
- btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bad_bkey,
- "invalid bkey: %s", buf.buf);
+ ret = bset_key_validate(c, b, u.s_c, updated_range, write);
+ if (ret == -BCH_ERR_fsck_delete_bkey)
goto drop_this_key;
- }
+ if (ret)
+ goto fsck_err;
if (write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
@@ -1228,23 +1216,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
- printbuf_reset(&buf);
-
- if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
+ ret = bch2_bkey_val_validate(c, u.s_c, READ);
+ if (ret == -BCH_ERR_fsck_delete_bkey ||
(bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
- printbuf_reset(&buf);
-
- prt_printf(&buf, "invalid bkey: ");
- bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
- prt_printf(&buf, "\n ");
- bch2_bkey_val_to_text(&buf, c, u.s_c);
-
- btree_err(-BCH_ERR_btree_node_read_err_fixable,
- c, NULL, b, i, k,
- btree_node_bad_bkey,
- "%s", buf.buf);
-
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -1253,6 +1228,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset_end(b, b->set);
continue;
}
+ if (ret)
+ goto fsck_err;
if (u.k->type == KEY_TYPE_btree_ptr_v2) {
struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
@@ -1767,6 +1744,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
set_btree_node_read_in_flight(b);
+ /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */
+ bch2_trans_unlock(trans);
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
@@ -1952,18 +1931,14 @@ static void btree_node_write_endio(struct bio *bio)
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- struct printbuf buf = PRINTBUF;
bool saw_error;
- int ret;
-
- ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
- BKEY_TYPE_btree, WRITE, &buf);
- if (ret)
- bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
- printbuf_exit(&buf);
- if (ret)
+ int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
+ BKEY_TYPE_btree, WRITE);
+ if (ret) {
+ bch2_fs_inconsistent(c, "invalid btree node key before write");
return ret;
+ }
ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 36872207f09b..2e84d22e17bd 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1900,6 +1900,7 @@ err:
goto out;
}
+/* Only kept for -tools */
struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
{
struct btree *b;
@@ -1921,6 +1922,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
bch2_trans_verify_not_in_restart(trans);
bch2_btree_iter_verify(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (ret)
+ goto err;
+
struct btree_path *path = btree_iter_path(trans, iter);
/* already at end? */
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index c7725865309c..222b7ce8a901 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \
_btree_id, _pos, _flags, KEY_TYPE_##_type))
+#define bkey_val_copy(_dst_v, _src_k) \
+do { \
+ unsigned b = min_t(unsigned, sizeof(*_dst_v), \
+ bkey_val_bytes(_src_k.k)); \
+ memcpy(_dst_v, _src_k.v, b); \
+ if (b < sizeof(*_dst_v)) \
+ memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \
+} while (0)
+
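
bkey_val_copy() copies at most sizeof(*dst) bytes from the key's value and zero-fills the remainder, so a struct read from an older, shorter on-disk value comes back fully initialized. Hypothetical usage (variable names are illustrative, reusing the bkey_s_c_backpointer type seen elsewhere in this diff):

	struct bch_backpointer bp_val;

	bkey_val_copy(&bp_val, bp);	/* bp: a struct bkey_s_c_backpointer */
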
static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
unsigned flags, unsigned type,
@@ -600,23 +609,35 @@ void bch2_trans_srcu_unlock(struct btree_trans *);
u32 bch2_trans_begin(struct btree_trans *);
-/*
- * XXX
- * this does not handle transaction restarts from bch2_btree_iter_next_node()
- * correctly
- */
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _locks_want, _depth, _flags, _b, _ret) \
- for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
- _start, _locks_want, _depth, _flags); \
- (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \
- !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \
- (_b) = bch2_btree_iter_next_node(&(_iter)))
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b, _do) \
+({ \
+ bch2_trans_begin((_trans)); \
+ \
+ struct btree_iter _iter; \
+ bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \
+ _start, _locks_want, _depth, _flags); \
+ int _ret3 = 0; \
+ do { \
+ _ret3 = lockrestart_do((_trans), ({ \
+ struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
+ if (!_b) \
+ break; \
+ \
+ PTR_ERR_OR_ZERO(_b) ?: (_do); \
+ })) ?: \
+ lockrestart_do((_trans), \
+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
+ } while (!_ret3); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _flags, _b, _ret) \
- __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- 0, 0, _flags, _b, _ret)
+ _flags, _b, _do) \
+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ 0, 0, _flags, _b, _do)
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
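
With this rewrite the iterator is declared by the macro itself, each body evaluation runs under lockrestart_do() so transaction restarts from bch2_btree_iter_next_node() are handled, and the macro's value is the error code. Usage sketch, mirroring the debug.c conversion later in this diff (btree_id, buf and ret are assumed caller variables):

	ret = bch2_trans_run(c,
		for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, b, ({
			bch2_btree_node_to_text(&buf, c, b);
			0;
		})));
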
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 74933490aaba..c1657182c275 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -530,6 +530,8 @@ static void __journal_keys_sort(struct journal_keys *keys)
{
sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
+ cond_resched();
+
struct journal_key *dst = keys->data;
darray_for_each(*keys, src) {
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index f2f2e525460b..fda7998734cb 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -497,11 +497,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path
path->l[1].b = NULL;
- if (bch2_btree_node_relock_notrace(trans, path, 0)) {
- path->uptodate = BTREE_ITER_UPTODATE;
- return 0;
- }
-
int ret;
do {
ret = btree_path_traverse_cached_fast(trans, path);
@@ -731,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ path->should_be_locked = false;
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
@@ -782,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+
+ /*
+ * Scanning is expensive while a rehash is in progress - most elements
+ * will be on the new hashtable, if it's in progress
+ *
+ * A rehash could still start while we're scanning - that's ok, we'll
+ * still see most elements.
+ */
+ if (unlikely(tbl->nest)) {
+ rcu_read_unlock();
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ return SHRINK_STOP;
+ }
+
if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0;
start = bc->shrink_iter;
@@ -789,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
do {
struct rhash_head *pos, *next;
- pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+ pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
while (!rht_is_a_nulls(pos)) {
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
@@ -870,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
while (atomic_long_read(&bc->nr_keys)) {
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- if (tbl)
+ if (tbl) {
+ if (tbl->nest) {
+ /* wait for in progress rehash */
+ rcu_read_unlock();
+ mutex_lock(&bc->table.mutex);
+ mutex_unlock(&bc->table.mutex);
+ rcu_read_lock();
+ continue;
+ }
for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
+ ck = container_of(pos, struct bkey_cached, hash);
bkey_cached_evict(bc, ck);
list_add(&ck->list, &items);
}
+ }
rcu_read_unlock();
}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index e6b2cd0dd2c1..51d6289b8dee 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -11,13 +11,27 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
return max_t(ssize_t, 0, nr_dirty - max_dirty);
}
-static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c)
{
size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
- return nr_dirty > max_dirty;
+ return nr_dirty - max_dirty;
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ return __bch2_btree_key_cache_must_wait(c) > 0;
+}
+
+static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 2048 + (nr_keys * 5) / 8;
+
+ return nr_dirty <= max_dirty;
}
int bch2_btree_key_cache_journal_flush(struct journal *,
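
The split into must_wait/wait_done adds hysteresis so commits don't thrash right at the threshold: with nr_keys = 1,000,000 cached keys, commits start blocking once nr_dirty exceeds 4096 + 750,000 = 754,096, and journal reclaim only signals completion once nr_dirty has dropped back to 2048 + 625,000 = 627,048 or below (worked numbers for illustration, not from the patch).
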
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 001107226377..b28c649c6838 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -530,7 +530,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
printbuf_exit(&buf);
- BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0));
ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
if (ret)
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index cca336fe46e9..a0101d9c5d83 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -712,7 +712,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
a->k.version = journal_pos_to_bversion(&trans->journal_res,
(u64 *) entry - (u64 *) trans->journal_entries);
BUG_ON(bversion_zero(a->k.version));
- ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false);
+ ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false);
if (ret)
goto revert_fs_usage;
}
@@ -798,7 +798,7 @@ revert_fs_usage:
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, false);
+ bch2_accounting_mem_mod_locked(trans, a.c, false, false);
bch2_accounting_neg(a);
}
percpu_up_read(&c->mark_lock);
@@ -818,50 +818,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}
-static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
- enum bch_validate_flags flags,
- struct btree_insert_entry *i,
- struct printbuf *err)
-{
- struct bch_fs *c = trans->c;
-
- printbuf_reset(err);
- prt_printf(err, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- printbuf_indent_add(err, 2);
-
- bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
- prt_newline(err);
-
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
- bch2_print_string_as_lines(KERN_ERR, err->buf);
-
- bch2_inconsistent_error(c);
- bch2_dump_trans_updates(trans);
-
- return -EINVAL;
-}
-
-static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans,
- struct jset_entry *i)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn);
- printbuf_indent_add(&buf, 2);
-
- bch2_journal_entry_to_text(&buf, c, i);
- prt_newline(&buf);
-
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
-
- bch2_inconsistent_error(c);
- bch2_dump_trans_updates(trans);
-
- return -EINVAL;
-}
-
static int bch2_trans_commit_journal_pin_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
@@ -927,7 +883,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
static int journal_reclaim_wait_done(struct bch_fs *c)
{
int ret = bch2_journal_error(&c->journal) ?:
- !bch2_btree_key_cache_must_wait(c);
+ bch2_btree_key_cache_wait_done(c);
if (!ret)
journal_reclaim_kick(&c->journal);
@@ -973,9 +929,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
bch2_trans_unlock(trans);
trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
+ track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
+
+ track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false);
+
if (ret < 0)
break;
@@ -1060,20 +1020,19 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
trans_for_each_update(trans, i) {
- struct printbuf buf = PRINTBUF;
enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
- if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->bkey_type, invalid_flags, &buf)))
- ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
- btree_insert_entry_checks(trans, i);
- printbuf_exit(&buf);
-
- if (ret)
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, invalid_flags);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
+ trans->fn, (void *) i->ip_allocated);
return ret;
+ }
+ btree_insert_entry_checks(trans, i);
}
for (struct jset_entry *i = trans->journal_entries;
@@ -1084,13 +1043,14 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
- if (unlikely(bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, invalid_flags)))
- ret = bch2_trans_commit_journal_entry_invalid(trans, i);
-
- if (ret)
+ ret = bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
+ trans->fn);
return ret;
+ }
}
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 31ee50184be2..8fd112026e7a 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
: 0;
int ret;
+ b = bch2_btree_node_mem_alloc(trans, interior_node);
+ if (IS_ERR(b))
+ return b;
+
+ BUG_ON(b->ob.nr);
+
mutex_lock(&c->btree_reserve_cache_lock);
if (c->btree_reserve_cache_nr > nr_reserve) {
struct btree_alloc *a =
@@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
obs = a->ob;
bkey_copy(&tmp.k, &a->k);
mutex_unlock(&c->btree_reserve_cache_lock);
- goto mem_alloc;
+ goto out;
}
mutex_unlock(&c->btree_reserve_cache_lock);
-
retry:
ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
@@ -341,7 +346,7 @@ retry:
c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
- return ERR_PTR(ret);
+ goto err;
if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
@@ -360,19 +365,16 @@ retry:
bch2_open_bucket_get(c, wp, &obs);
bch2_alloc_sectors_done(c, wp);
-mem_alloc:
- b = bch2_btree_node_mem_alloc(trans, interior_node);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-
- /* we hold cannibalize_lock: */
- BUG_ON(IS_ERR(b));
- BUG_ON(b->ob.nr);
-
+out:
bkey_copy(&b->key, &tmp.k);
b->ob = obs;
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
return b;
+err:
+ bch2_btree_node_to_freelist(c, b);
+ return ERR_PTR(ret);
}
static struct btree *bch2_btree_node_alloc(struct btree_update *as,
@@ -1264,7 +1266,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
bch2_trans_unlock(trans);
- closure_sync(&cl);
+ bch2_wait_on_allocator(c, &cl);
} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
}
@@ -1364,18 +1366,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
- btree_node_type(b), WRITE, &buf) ?:
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
- printbuf_reset(&buf);
- prt_printf(&buf, "inserting invalid bkey\n ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
- prt_printf(&buf, "\n ");
- bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
- btree_node_type(b), WRITE, &buf);
- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
+ if (bch2_bkey_validate(c, bkey_i_to_s_c(insert),
+ btree_node_type(b), BCH_VALIDATE_write) ?:
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) {
+ bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
dump_stack();
}
@@ -2447,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
}
new_hash = bch2_btree_node_mem_alloc(trans, false);
+ ret = PTR_ERR_OR_ZERO(new_hash);
+ if (ret)
+ goto err;
}
path->intent_ref++;
@@ -2454,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
commit_flags, skip_triggers);
--path->intent_ref;
- if (new_hash) {
- mutex_lock(&c->btree_cache.lock);
- list_move(&new_hash->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
-
- six_unlock_write(&new_hash->c.lock);
- six_unlock_intent(&new_hash->c.lock);
- }
+ if (new_hash)
+ bch2_btree_node_to_freelist(c, new_hash);
+err:
closure_sync(&cl);
bch2_btree_cache_cannibalize_unlock(trans);
return ret;
@@ -2530,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
b = bch2_btree_node_mem_alloc(trans, false);
bch2_btree_cache_cannibalize_unlock(trans);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret;
+
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
b->c.level = level;
@@ -2561,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id
void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
- bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level));
+ bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
}
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 2650a0d24663..721bbe1dffc1 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -71,17 +71,21 @@ bch2_fs_usage_read_short(struct bch_fs *c)
return ret;
}
-void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
+void bch2_dev_usage_to_text(struct printbuf *out,
+ struct bch_dev *ca,
+ struct bch_dev_usage *usage)
{
prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
bch2_prt_data_type(out, i);
prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
- usage->d[i].buckets,
- usage->d[i].sectors,
- usage->d[i].fragmented);
+ usage->d[i].buckets,
+ usage->d[i].sectors,
+ usage->d[i].fragmented);
}
+
+ prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
}
static int bch2_check_fix_ptr(struct btree_trans *trans,
@@ -96,12 +100,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (!ca) {
- if (fsck_err(trans, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
+ trans, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
*do_update = true;
return 0;
}
@@ -558,7 +563,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
- if (insert)
+ if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
ret = -EIO;
goto err;
}
@@ -695,7 +700,8 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
+ enum btree_iter_update_trigger_flags flags,
+ s64 *replicas_sectors)
{
bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -704,7 +710,6 @@ static int __trigger_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 replicas_sectors = 0;
int ret = 0;
struct disk_accounting_pos acc_replicas_key = {
@@ -735,7 +740,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (ret)
return ret;
} else if (!p.has_ec) {
- replicas_sectors += disk_sectors;
+ *replicas_sectors += disk_sectors;
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -773,7 +778,7 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
if (ret)
return ret;
}
@@ -783,7 +788,7 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_snapshot,
.snapshot.id = k.k->p.snapshot,
};
- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
if (ret)
return ret;
}
@@ -803,16 +808,21 @@ static int __trigger_extent(struct btree_trans *trans,
.type = BCH_DISK_ACCOUNTING_btree,
.btree.id = btree_id,
};
- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
if (ret)
return ret;
- }
-
- if (bch2_bkey_rebalance_opts(k)) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_rebalance_work,
+ } else {
+ bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct disk_accounting_pos acc_inum_key = {
+ .type = BCH_DISK_ACCOUNTING_inum,
+ .inum.inum = k.k->p.inode,
};
- ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc);
+ s64 v[3] = {
+ insert ? 1 : -1,
+ insert ? k.k->size : -((s64) k.k->size),
+ *replicas_sectors,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
if (ret)
return ret;
}
@@ -825,6 +835,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
@@ -840,21 +851,53 @@ int bch2_trigger_extent(struct btree_trans *trans,
new_ptrs_bytes))
return 0;
- if (flags & BTREE_TRIGGER_transactional) {
- struct bch_fs *c = trans->c;
- int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
- (int) bch2_bkey_needs_rebalance(c, old);
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
- if (mod) {
+ if (old.k->type) {
+ int ret = __trigger_extent(trans, btree, level, old,
+ flags & ~BTREE_TRIGGER_insert,
+ &old_replicas_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (new.k->type) {
+ int ret = __trigger_extent(trans, btree, level, new.s_c,
+ flags & ~BTREE_TRIGGER_overwrite,
+ &new_replicas_sectors);
+ if (ret)
+ return ret;
+ }
+
+ int need_rebalance_delta = 0;
+ s64 need_rebalance_sectors_delta = 0;
+
+ s64 s = bch2_bkey_sectors_need_rebalance(c, old);
+ need_rebalance_delta -= s != 0;
+ need_rebalance_sectors_delta -= s;
+
+ s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
+ need_rebalance_delta += s != 0;
+ need_rebalance_sectors_delta += s;
+
+ if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
- new.k->p, mod > 0);
+ new.k->p, need_rebalance_delta > 0);
if (ret)
return ret;
}
- }
- if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc))
- return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags);
+ if (need_rebalance_sectors_delta) {
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_rebalance_work,
+ };
+ int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
+ flags & BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
+ }
return 0;
}
@@ -897,7 +940,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
enum bch_data_type type,
unsigned sectors)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
int ret = 0;
@@ -907,7 +949,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
return PTR_ERR(a);
if (a->v.data_type && type && a->v.data_type != type) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
bucket_metadata_type_mismatch,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
@@ -1028,13 +1070,18 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
enum btree_iter_update_trigger_flags flags)
{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ struct bch_fs *c = trans->c;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_layout layout = ca->disk_sb.sb->layout;
+ mutex_unlock(&c->sb_lock);
+
u64 bucket = 0;
unsigned i, bucket_sectors = 0;
int ret;
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ for (i = 0; i < layout.nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout.sb_offset[i]);
if (offset == BCH_SB_SECTOR) {
ret = bch2_trans_mark_metadata_sectors(trans, ca,
@@ -1045,7 +1092,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c
}
ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
- offset + (1 << layout->sb_max_size_bits),
+ offset + (1 << layout.sb_max_size_bits),
BCH_DATA_sb, &bucket, &bucket_sectors, flags);
if (ret)
return ret;
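
Worked example of the new per-inode accounting entry (numbers illustrative): inserting an uncompressed 128-sector user extent with two non-cached replicas accumulates replicas_sectors = 256, so the BCH_DISK_ACCOUNTING_inum key for that inode is updated with the triple below; on overwrite the signs flip.

	s64 v[3] = {
		1,	/* one extent */
		128,	/* k.k->size */
		256,	/* replicas_sectors */
	};
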
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 2d35eeb24a2d..edbdffd508fc 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -212,7 +212,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
return ret;
}
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index ec1b636ef78d..f9fb150eda70 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -93,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
.dev_bucket = (u64) dev << 56 | bucket,
.journal_seq = journal_seq,
};
- size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
+ size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0;
int ret = 0;
mutex_lock(&b->lock);
@@ -106,8 +106,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
for (i = 0; i < size; i++)
nr_elements += t->d[i].journal_seq > flushed_seq;
- new_bits = t->bits + (nr_elements * 3 > size);
-
+ new_bits = ilog2(roundup_pow_of_two(nr_elements * 3));
+realloc:
n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
if (!n) {
ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
@@ -115,7 +115,16 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
}
retry_rehash:
+ if (nr_rehashes_this_size == 3) {
+ new_bits++;
+ nr_rehashes_this_size = 0;
+ kvfree(n);
+ goto realloc;
+ }
+
nr_rehashes++;
+ nr_rehashes_this_size++;
+
bucket_table_init(n, new_bits);
tmp = new;
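
The table is now sized directly from the live element count instead of growing one bit at a time, and a rehash that keeps failing at a given size bumps new_bits and reallocates. Worked example (not from the patch): with nr_elements = 1000,

	new_bits = ilog2(roundup_pow_of_two(1000 * 3));	/* = ilog2(4096) = 12 */

so the table starts at 4096 slots (~24% load); after three failed rehashes at that size it is doubled.
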
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0087b8555ead..004894ad4147 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -20,6 +20,76 @@
#include "subvolume.h"
#include "trace.h"
+static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr)
+ bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
+}
+
+static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!bch2_dev_tryget(c, ptr->dev)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+ bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
+ }
+ return false;
+ }
+ }
+ return true;
+}
+
+static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+}
+
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+
+ if (ctxt) {
+ bool locked;
+
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+ list_empty(&ctxt->ios));
+
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+ } else {
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
+ bkey_for_each_ptr(ptrs, ptr2) {
+ if (ptr2 == ptr)
+ break;
+
+ bucket = PTR_BUCKET_POS(ca, ptr2);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k)
{
if (trace_move_extent_finish_enabled()) {
@@ -250,10 +320,8 @@ restart_drop_extra_replicas:
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
- struct printbuf err = PRINTBUF;
- int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
- printbuf_exit(&err);
-
+ int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id),
+ BCH_VALIDATE_commit);
if (invalid) {
struct printbuf buf = PRINTBUF;
@@ -269,6 +337,7 @@ restart_drop_extra_replicas:
printbuf_exit(&buf);
bch2_fatal_error(c);
+ ret = -EIO;
goto out;
}
@@ -357,17 +426,11 @@ void bch2_data_update_read_done(struct data_update *m,
void bch2_data_update_exit(struct data_update *update)
{
struct bch_fs *c = update->op.c;
- struct bkey_ptrs_c ptrs =
- bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
- if (c->opts.nocow_enabled)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(ca, ptr), 0);
- bch2_dev_put(ca);
- }
+ struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
+ if (c->opts.nocow_enabled)
+ bkey_nocow_unlock(c, k);
+ bkey_put_dev_refs(c, k);
bch2_bkey_buf_exit(&update->k, c);
bch2_disk_reservation_put(c, &update->op.res);
bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
@@ -477,6 +540,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
bch2_compression_opt_to_text(out, background_compression(*io_opts));
prt_newline(out);
+ prt_str(out, "opts.replicas:\t");
+ prt_u64(out, io_opts->data_replicas);
+
prt_str(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
}
@@ -545,7 +611,6 @@ int bch2_data_update_init(struct btree_trans *trans,
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
- unsigned ptrs_locked = 0;
int ret = 0;
/*
@@ -556,6 +621,15 @@ int bch2_data_update_init(struct btree_trans *trans,
if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
return -BCH_ERR_data_update_done;
+ if (!bkey_get_dev_refs(c, k))
+ return -BCH_ERR_data_update_done;
+
+ if (c->opts.nocow_enabled &&
+ !bkey_nocow_lock(c, ctxt, k)) {
+ bkey_put_dev_refs(c, k);
+ return -BCH_ERR_nocow_lock_blocked;
+ }
+
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
@@ -577,40 +651,24 @@ int bch2_data_update_init(struct btree_trans *trans,
m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
- bkey_for_each_ptr(ptrs, ptr) {
- if (!bch2_dev_tryget(c, ptr->dev)) {
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
- bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
- }
- return -BCH_ERR_data_update_done;
- }
- }
-
unsigned durability_have = 0, durability_removing = 0;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
- bool locked;
-
- rcu_read_lock();
- if (((1U << i) & m->data_opts.rewrite_ptrs)) {
- BUG_ON(p.ptr.cached);
-
- if (crc_is_compressed(p.crc))
- reserve_sectors += k.k->size;
-
- m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
- durability_removing += bch2_extent_ptr_desired_durability(c, &p);
- } else if (!p.ptr.cached &&
- !((1U << i) & m->data_opts.kill_ptrs)) {
- bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
- durability_have += bch2_extent_ptr_durability(c, &p);
+ if (!p.ptr.cached) {
+ rcu_read_lock();
+ if (BIT(i) & m->data_opts.rewrite_ptrs) {
+ if (crc_is_compressed(p.crc))
+ reserve_sectors += k.k->size;
+
+ m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
+ durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+ } else if (!(BIT(i) & m->data_opts.kill_ptrs)) {
+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+ durability_have += bch2_extent_ptr_durability(c, &p);
+ }
+ rcu_read_unlock();
}
- rcu_read_unlock();
/*
* op->csum_type is normally initialized from the fs/file's
@@ -625,24 +683,6 @@ int bch2_data_update_init(struct btree_trans *trans,
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
- if (c->opts.nocow_enabled) {
- if (ctxt) {
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
- bucket, 0)) ||
- list_empty(&ctxt->ios));
-
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- } else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
- ret = -BCH_ERR_nocow_lock_blocked;
- goto err;
- }
- }
- ptrs_locked |= (1U << i);
- }
-
i++;
}
@@ -656,16 +696,6 @@ int bch2_data_update_init(struct btree_trans *trans,
* Increasing replication is an explicit operation triggered by
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
- if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
- !durability_required) {
- m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
- m->data_opts.rewrite_ptrs = 0;
- /* if iter == NULL, it's just a promote */
- if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
- goto done;
- }
-
m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
@@ -677,48 +707,38 @@ int bch2_data_update_init(struct btree_trans *trans,
if (!(durability_have + durability_removing))
m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
- if (!m->op.nr_replicas) {
- struct printbuf buf = PRINTBUF;
+ m->op.nr_replicas_required = m->op.nr_replicas;
- bch2_data_update_to_text(&buf, m);
- WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_data_update_done;
- goto done;
+ /*
+ * It might turn out that we don't need any new replicas, if the
+ * replicas or durability settings have been changed since the extent
+ * was written:
+ */
+ if (!m->op.nr_replicas) {
+ m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+ m->data_opts.rewrite_ptrs = 0;
+ /* if iter == NULL, it's just a promote */
+ if (iter)
+ ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+ goto out;
}
- m->op.nr_replicas_required = m->op.nr_replicas;
-
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas
? 0
: BCH_DISK_RESERVATION_NOFAIL);
if (ret)
- goto err;
+ goto out;
}
if (bkey_extent_is_unwritten(k)) {
bch2_update_unwritten_extent(trans, m);
- goto done;
+ goto out;
}
return 0;
-err:
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev);
- struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
- if ((1U << i) & ptrs_locked)
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- bch2_dev_put(ca);
- i++;
- }
-
- bch2_bkey_buf_exit(&m->k, c);
- bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
- return ret;
-done:
+out:
bch2_data_update_exit(m);
return ret ?: -BCH_ERR_data_update_done;
}
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index ebabab171fe5..45aec1afdb0e 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -397,47 +397,27 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct btree *b;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
+ ssize_t ret = flush_buf(i);
if (ret)
return ret;
if (bpos_eq(SPOS_MAX, i->from))
return i->ret;
- trans = bch2_trans_get(i->c);
-retry:
- bch2_trans_begin(trans);
-
- for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
- bch2_btree_node_to_text(&i->buf, i->c, b);
- i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
- ? bpos_successor(b->key.k.p)
- : b->key.k.p;
-
- ret = drop_locks_do(trans, flush_buf(i));
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
-
- if (!ret)
- ret = flush_buf(i);
+ return bch2_trans_run(i->c,
+ for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({
+ bch2_btree_node_to_text(&i->buf, i->c, b);
+ i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
+ : b->key.k.p;
- return ret ?: i->ret;
+ drop_locks_do(trans, flush_buf(i));
+ }))) ?: i->ret;
}
static const struct file_operations btree_format_debug_ops = {
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index d743da89308e..32bfdf19289a 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -100,20 +100,19 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.is_visible = dirent_is_visible,
};
-int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
int ret = 0;
- bkey_fsck_err_on(!d_name.len, c, err,
- dirent_empty_name,
+ bkey_fsck_err_on(!d_name.len,
+ c, dirent_empty_name,
"empty name");
- bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
- dirent_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
+ c, dirent_val_too_big,
"value too big (%zu > %u)",
bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
@@ -121,27 +120,27 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
* Check new keys don't exceed the max length
* (older keys may be larger.)
*/
- bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err,
- dirent_name_too_long,
+ bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
+ c, dirent_name_too_long,
"dirent name too big (%u > %u)",
d_name.len, BCH_NAME_MAX);
- bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
- dirent_name_embedded_nul,
+ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len),
+ c, dirent_name_embedded_nul,
"dirent has stray data after name's NUL");
bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
- (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
- dirent_name_dot_or_dotdot,
+ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)),
+ c, dirent_name_dot_or_dotdot,
"invalid name");
- bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
- dirent_name_has_slash,
+ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len),
+ c, dirent_name_has_slash,
"name with /");
bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
- dirent_to_itself,
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode,
+ c, dirent_to_itself,
"dirent points to own directory");
fsck_err:
return ret;
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 24037e6e0a09..8945145865c5 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -7,12 +7,11 @@
enum bch_validate_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_dirent ((struct bkey_ops) { \
- .key_invalid = bch2_dirent_invalid, \
+ .key_validate = bch2_dirent_validate, \
.val_to_text = bch2_dirent_to_text, \
.min_val_size = 16, \
})
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index dcdd59249c23..e972e2bca546 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -114,11 +114,73 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
}
-int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+static inline bool is_zero(char *start, char *end)
{
- return 0;
+ BUG_ON(start > end);
+
+ for (; start < end; start++)
+ if (*start)
+ return false;
+ return true;
+}
+
+#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
+
+int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
+{
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+ void *end = &acc_k + 1;
+ int ret = 0;
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_nr_inodes:
+ end = field_end(acc_k, nr_inodes);
+ break;
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ end = field_end(acc_k, persistent_reserved);
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ bkey_fsck_err_on(!acc_k.replicas.nr_devs,
+ c, accounting_key_replicas_nr_devs_0,
+ "accounting key replicas entry with nr_devs=0");
+
+ bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
+ (acc_k.replicas.nr_required > 1 &&
+ acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
+ c, accounting_key_replicas_nr_required_bad,
+ "accounting key replicas entry with bad nr_required");
+
+ for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
+ bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
+ c, accounting_key_replicas_devs_unsorted,
+ "accounting key replicas entry with unsorted devs");
+
+ end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ end = field_end(acc_k, dev_data_type);
+ break;
+ case BCH_DISK_ACCOUNTING_compression:
+ end = field_end(acc_k, compression);
+ break;
+ case BCH_DISK_ACCOUNTING_snapshot:
+ end = field_end(acc_k, snapshot);
+ break;
+ case BCH_DISK_ACCOUNTING_btree:
+ end = field_end(acc_k, btree);
+ break;
+ case BCH_DISK_ACCOUNTING_rebalance_work:
+ end = field_end(acc_k, rebalance_work);
+ break;
+ }
+
+ bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
+ c, accounting_key_junk_at_end,
+ "junk at end of accounting key");
+fsck_err:
+ return ret;
}
void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
@@ -465,6 +527,9 @@ int bch2_gc_accounting_done(struct bch_fs *c)
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, e->pos);
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ continue;
+
u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
@@ -501,7 +566,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
- bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false);
+ bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false);
preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -530,7 +595,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
return 0;
percpu_down_read(&c->mark_lock);
- int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false);
+ int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true);
percpu_up_read(&c->mark_lock);
if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) &&
@@ -697,6 +762,15 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
unsigned nr = bch2_accounting_counters(k.k);
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+ continue;
+
+ if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+ continue;
+
bch2_accounting_mem_read(c, k.k->p, v, nr);
if (memcmp(a.v->d, v, nr * sizeof(u64))) {
@@ -712,9 +786,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
mismatch = true;
}
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, a.k->p);
-
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved:
base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index 3d3f25e08b69..f29fd0dd9581 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -82,14 +82,13 @@ int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
s64 *, unsigned, bool);
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
-int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_accounting_swab(struct bkey_s);
#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
- .key_invalid = bch2_accounting_invalid, \
+ .key_validate = bch2_accounting_validate, \
.val_to_text = bch2_accounting_to_text, \
.swab = bch2_accounting_swab, \
.min_val_size = 8, \
@@ -107,41 +106,20 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r)
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool);
void bch2_accounting_mem_gc(struct bch_fs *);
-static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
-{
- struct bch_accounting_mem *acc = &c->accounting;
- unsigned idx;
-
- EBUG_ON(gc && !acc->gc_running);
-
- while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
- accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = bch2_accounting_mem_insert(c, a, gc);
- if (ret)
- return ret;
- }
-
- struct accounting_mem_entry *e = &acc->k.data[idx];
-
- EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
-
- for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
- this_cpu_add(e->v[gc][i], a.v->d[i]);
- return 0;
-}
-
/*
* Update in memory counters so they match the btree update we're doing; called
* from transaction commit path
*/
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
+static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read)
{
struct bch_fs *c = trans->c;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, a.k->p);
- if (!gc) {
- struct disk_accounting_pos acc_k;
- bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+ if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+ return 0;
+ if (!gc && !read) {
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved:
trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
@@ -162,13 +140,31 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
}
}
- return __bch2_accounting_mem_mod(c, a, gc);
+ struct bch_accounting_mem *acc = &c->accounting;
+ unsigned idx;
+
+ EBUG_ON(gc && !acc->gc_running);
+
+ while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
+ int ret = bch2_accounting_mem_insert(c, a, gc);
+ if (ret)
+ return ret;
+ }
+
+ struct accounting_mem_entry *e = &acc->k.data[idx];
+
+ EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ this_cpu_add(e->v[gc][i], a.v->d[i]);
+ return 0;
}
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false);
percpu_up_read(&trans->c->mark_lock);
return ret;
}
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index cba417060b33..7b6e6c97e6aa 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -103,7 +103,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
x(compression, 4) \
x(snapshot, 5) \
x(btree, 6) \
- x(rebalance_work, 7)
+ x(rebalance_work, 7) \
+ x(inum, 8)
enum disk_accounting_type {
#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
@@ -124,20 +125,23 @@ struct bch_dev_data_type {
__u8 data_type;
};
-struct bch_dev_stripe_buckets {
- __u8 dev;
-};
-
struct bch_acct_compression {
__u8 type;
};
struct bch_acct_snapshot {
__u32 id;
-};
+} __packed;
struct bch_acct_btree {
__u32 id;
+} __packed;
+
+struct bch_acct_inum {
+ __u64 inum;
+} __packed;
+
+struct bch_acct_rebalance_work {
};
struct disk_accounting_pos {
@@ -149,12 +153,13 @@ struct disk_accounting_pos {
struct bch_persistent_reserved persistent_reserved;
struct bch_replicas_entry_v1 replicas;
struct bch_dev_data_type dev_data_type;
- struct bch_dev_stripe_buckets dev_stripe_buckets;
struct bch_acct_compression compression;
struct bch_acct_snapshot snapshot;
struct bch_acct_btree btree;
- };
- };
+ struct bch_acct_rebalance_work rebalance_work;
+ struct bch_acct_inum inum;
+ } __packed;
+ } __packed;
struct bpos _pad;
};
};
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 9b5b5c9a6c63..141a4c63142f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -107,24 +107,23 @@ struct ec_bio {
/* Stripes btree keys: */
-int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
int ret = 0;
bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
- bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
- stripe_pos_bad,
+ bpos_gt(k.k->p, POS(0, U32_MAX)),
+ c, stripe_pos_bad,
"stripe at bad pos");
- bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
- stripe_val_size_bad,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
+ c, stripe_val_size_bad,
"incorrect value size (%zu < %u)",
bkey_val_u64s(k.k), stripe_val_u64s(s));
- ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
return ret;
}
@@ -1809,6 +1808,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
BUG_ON(v->nr_redundant != h->s->nr_parity);
+ /* We bypass the sector allocator which normally does this: */
+ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
+
for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
__clear_bit(v->ptrs[i].dev, devs.d);
if (i < h->s->nr_data)
@@ -2235,6 +2237,23 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
mutex_unlock(&c->ec_stripes_heap_lock);
}
+static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+ struct ec_stripe_new *s)
+{
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
+ s->idx, s->nr_data, s->nr_parity,
+ bitmap_weight(s->blocks_allocated, s->nr_data),
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_watermarks[s->h->watermark]);
+
+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+ unsigned i;
+ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
+ prt_printf(out, " %u", s->blocks[i]);
+ prt_newline(out);
+}
+
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
{
struct ec_stripe_head *h;
@@ -2247,23 +2266,15 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
bch2_watermarks[h->watermark]);
if (h->s)
- prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
- h->s->idx, h->s->nr_data, h->s->nr_parity,
- bitmap_weight(h->s->blocks_allocated,
- h->s->nr_data));
+ bch2_new_stripe_to_text(out, c, h->s);
}
mutex_unlock(&c->ec_stripe_head_lock);
prt_printf(out, "in flight:\n");
mutex_lock(&c->ec_stripe_new_lock);
- list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
- s->idx, s->nr_data, s->nr_parity,
- atomic_read(&s->ref[STRIPE_REF_io]),
- atomic_read(&s->ref[STRIPE_REF_stripe]),
- bch2_watermarks[s->h->watermark]);
- }
+ list_for_each_entry(s, &c->ec_stripe_new_list, list)
+ bch2_new_stripe_to_text(out, c, s);
mutex_unlock(&c->ec_stripe_new_lock);
}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 84a23eeb6249..9baf3411a8f9 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -8,8 +8,7 @@
enum bch_validate_flags;
-int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
@@ -17,7 +16,7 @@ int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
- .key_invalid = bch2_stripe_invalid, \
+ .key_validate = bch2_stripe_validate, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
.trigger = bch2_trigger_stripe, \
@@ -98,7 +97,9 @@ static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe
const struct bch_extent_ptr *data_ptr,
unsigned sectors)
{
- return data_ptr->dev == stripe_ptr->dev &&
+ return (data_ptr->dev == stripe_ptr->dev ||
+ data_ptr->dev == BCH_SB_MEMBER_INVALID ||
+ stripe_ptr->dev == BCH_SB_MEMBER_INVALID) &&
data_ptr->gen == stripe_ptr->gen &&
data_ptr->offset >= stripe_ptr->offset &&
data_ptr->offset < stripe_ptr->offset + sectors;
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index a268af3e52bf..742dcdd3e5d7 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -166,6 +166,7 @@
x(0, journal_reclaim_would_deadlock) \
x(EINVAL, fsck) \
x(BCH_ERR_fsck, fsck_fix) \
+ x(BCH_ERR_fsck, fsck_delete_bkey) \
x(BCH_ERR_fsck, fsck_ignore) \
x(BCH_ERR_fsck, fsck_errors_not_fixed) \
x(BCH_ERR_fsck, fsck_repair_unimplemented) \
@@ -256,7 +257,6 @@
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
- x(0, need_inode_lock) \
x(0, invalid_snapshot_node) \
x(0, option_needs_open_fs)
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index a62b63108820..95afa7bf2020 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -416,6 +416,28 @@ err:
return ret;
}
+int __bch2_bkey_fsck_err(struct bch_fs *c,
+ struct bkey_s_c k,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
+{
+ struct printbuf buf = PRINTBUF;
+ va_list args;
+
+ prt_str(&buf, "invalid bkey ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\n ");
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+ prt_str(&buf, ": delete?");
+
+ int ret = __bch2_fsck_err(c, NULL, flags, err, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
void bch2_flush_fsck_errs(struct bch_fs *c)
{
struct fsck_err_state *s, *n;
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 995e6bba9bad..2f1b86978f36 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -4,6 +4,7 @@
#include <linux/list.h>
#include <linux/printk.h>
+#include "bkey_types.h"
#include "sb-errors.h"
struct bch_dev;
@@ -166,24 +167,30 @@ void bch2_flush_fsck_errs(struct bch_fs *);
#define fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-__printf(4, 0)
-static inline void bch2_bkey_fsck_err(struct bch_fs *c,
- struct printbuf *err_msg,
- enum bch_sb_error_id err_type,
- const char *fmt, ...)
-{
- va_list args;
+__printf(5, 6)
+int __bch2_bkey_fsck_err(struct bch_fs *,
+ struct bkey_s_c,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id,
+ const char *, ...);
- va_start(args, fmt);
- prt_vprintf(err_msg, fmt, args);
- va_end(args);
-}
-
-#define bkey_fsck_err(c, _err_msg, _err_type, ...) \
+/*
+ * for now, bkey fsck errors are always handled by deleting the entire key -
+ * this will change at some point
+ */
+#define bkey_fsck_err(c, _err_type, _err_msg, ...) \
do { \
- prt_printf(_err_msg, __VA_ARGS__); \
- bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \
- ret = -BCH_ERR_invalid_bkey; \
+ if ((flags & BCH_VALIDATE_silent)) { \
+ ret = -BCH_ERR_fsck_delete_bkey; \
+ goto fsck_err; \
+ } \
+ int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \
+ BCH_FSCK_ERR_##_err_type, \
+ _err_msg, ##__VA_ARGS__); \
+ if (_ret != -BCH_ERR_fsck_fix && \
+ _ret != -BCH_ERR_fsck_ignore) \
+ ret = _ret; \
+ ret = -BCH_ERR_fsck_delete_bkey; \
goto fsck_err; \
} while (0)
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 07973198e35f..324303bf4353 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -171,17 +171,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
- btree_ptr_val_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
+ c, btree_ptr_val_too_big,
"value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
return ret;
}
@@ -192,28 +191,27 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
int ret = 0;
bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
- c, err, btree_ptr_v2_val_too_big,
+ c, btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
- c, err, btree_ptr_v2_min_key_bad,
+ c, btree_ptr_v2_min_key_bad,
"min_key > key");
if (flags & BCH_VALIDATE_write)
bkey_fsck_err_on(!bp.v->sectors_written,
- c, err, btree_ptr_v2_written_0,
+ c, btree_ptr_v2_written_0,
"sectors_written == 0");
- ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
return ret;
}
@@ -399,15 +397,14 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
int ret = 0;
- bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
- reservation_key_nr_replicas_invalid,
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
+ c, reservation_key_nr_replicas_invalid,
"invalid nr_replicas (%u)", r.v->nr_replicas);
fsck_err:
return ret;
@@ -784,14 +781,17 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
/*
* Returns pointer to the next entry after the one being dropped:
*/
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
- union bch_extent_entry *ret = entry;
bool drop_crc = true;
+ if (k.k->type == KEY_TYPE_stripe) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+
EBUG_ON(ptr < &ptrs.start->ptr ||
ptr >= &ptrs.end->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
@@ -814,21 +814,16 @@ union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
break;
if ((extent_entry_is_crc(entry) && drop_crc) ||
- extent_entry_is_stripe_ptr(entry)) {
- ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_is_stripe_ptr(entry))
extent_entry_drop(k, entry);
- }
}
-
- return ret;
}
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
- union bch_extent_entry *ret =
- bch2_bkey_drop_ptr_noerror(k, ptr);
+
+ bch2_bkey_drop_ptr_noerror(k, ptr);
/*
* If we deleted all the dirty pointers and there's still cached
@@ -840,14 +835,10 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
!bch2_bkey_dirty_devs(k.s_c).nr) {
k.k->type = KEY_TYPE_error;
set_bkey_val_u64s(k.k, 0);
- ret = NULL;
} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
k.k->type = KEY_TYPE_deleted;
set_bkey_val_u64s(k.k, 0);
- ret = NULL;
}
-
- return ret;
}
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
@@ -932,8 +923,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
if (p1.ptr.dev == p2.ptr.dev &&
p1.ptr.gen == p2.ptr.gen &&
+
+ /*
+ * This checks that the two pointers point
+ * to the same region on disk - adjusting
+ * for the difference in where the extents
+ * start, since one may have been trimmed:
+ */
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
+
+ /*
+ * This additionally checks that the
+ * extents overlap on disk, since the
+ * previous check may trigger spuriously
+ * when one extent is immediately partially
+ * overwritten with another extent (so that
+ * on disk they are adjacent) and
+ * compression is in use:
+ */
+ ((p1.ptr.offset >= p2.ptr.offset &&
+ p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
+ (p2.ptr.offset >= p1.ptr.offset &&
+ p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
return true;
return false;
@@ -1020,6 +1032,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
prt_printf(out, "ptr: %u:%llu:%u gen %u",
ptr->dev, b, offset, ptr->gen);
+ if (ca->mi.durability != 1)
+ prt_printf(out, " d=%u", ca->mi.durability);
if (ptr->cached)
prt_str(out, " cached");
if (ptr->unwritten)
@@ -1102,14 +1116,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
-
-static int extent_ptr_invalid(struct bch_fs *c,
- struct bkey_s_c k,
- enum bch_validate_flags flags,
- const struct bch_extent_ptr *ptr,
- unsigned size_ondisk,
- bool metadata,
- struct printbuf *err)
+static int extent_ptr_validate(struct bch_fs *c,
+ struct bkey_s_c k,
+ enum bch_validate_flags flags,
+ const struct bch_extent_ptr *ptr,
+ unsigned size_ondisk,
+ bool metadata)
{
int ret = 0;
@@ -1128,28 +1140,27 @@ static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr2)
- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
- ptr_to_duplicate_device,
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
+ c, ptr_to_duplicate_device,
"multiple pointers to same device (%u)", ptr->dev);
- bkey_fsck_err_on(bucket >= nbuckets, c, err,
- ptr_after_last_bucket,
+ bkey_fsck_err_on(bucket >= nbuckets,
+ c, ptr_after_last_bucket,
"pointer past last bucket (%llu > %llu)", bucket, nbuckets);
- bkey_fsck_err_on(bucket < first_bucket, c, err,
- ptr_before_first_bucket,
+ bkey_fsck_err_on(bucket < first_bucket,
+ c, ptr_before_first_bucket,
"pointer before first bucket (%llu < %u)", bucket, first_bucket);
- bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err,
- ptr_spans_multiple_buckets,
+ bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
+ c, ptr_spans_multiple_buckets,
"pointer spans multiple buckets (%u + %u > %u)",
bucket_offset, size_ondisk, bucket_size);
fsck_err:
return ret;
}
-int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1164,25 +1175,24 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
- bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
- extent_ptrs_invalid_entry,
- "invalid extent entry type (got %u, max %u)",
- __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
+ c, extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
- !extent_entry_is_ptr(entry), c, err,
- btree_ptr_has_non_ptr,
+ !extent_entry_is_ptr(entry),
+ c, btree_ptr_has_non_ptr,
"has non ptr field");
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
- ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
- size_ondisk, false, err);
+ ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false);
if (ret)
return ret;
- bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
- ptr_cached_and_erasure_coded,
+ bkey_fsck_err_on(entry->ptr.cached && have_ec,
+ c, ptr_cached_and_erasure_coded,
"cached, erasure coded ptr");
if (!entry->ptr.unwritten)
@@ -1199,44 +1209,50 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
- ptr_crc_uncompressed_size_too_small,
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
+ c, ptr_crc_uncompressed_size_too_small,
"checksum offset + key size > uncompressed size");
- bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
- ptr_crc_csum_type_unknown,
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
+ c, ptr_crc_csum_type_unknown,
"invalid checksum type");
- bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
- ptr_crc_compression_type_unknown,
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
+ c, ptr_crc_compression_type_unknown,
"invalid compression type");
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
else if (nonce != crc.offset + crc.nonce)
- bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+ bkey_fsck_err(c, ptr_crc_nonce_mismatch,
"incorrect nonce");
}
- bkey_fsck_err_on(crc_since_last_ptr, c, err,
- ptr_crc_redundant,
+ bkey_fsck_err_on(crc_since_last_ptr,
+ c, ptr_crc_redundant,
"redundant crc entry");
crc_since_last_ptr = true;
bkey_fsck_err_on(crc_is_encoded(crc) &&
(crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
- (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err,
- ptr_crc_uncompressed_size_too_big,
+ (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
+ c, ptr_crc_uncompressed_size_too_big,
"too large encoded extent");
size_ondisk = crc.compressed_size;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
- bkey_fsck_err_on(have_ec, c, err,
- ptr_stripe_redundant,
+ bkey_fsck_err_on(have_ec,
+ c, ptr_stripe_redundant,
"redundant stripe entry");
have_ec = true;
break;
case BCH_EXTENT_ENTRY_rebalance: {
+ /*
+ * this shouldn't be a fsck error, for forward
+ * compatibility; the rebalance code should just refetch
+ * the compression opt if it's unknown
+ */
+#if 0
const struct bch_extent_rebalance *r = &entry->rebalance;
if (!bch2_compression_opt_valid(r->compression)) {
@@ -1245,28 +1261,29 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
opt.type, opt.level);
return -BCH_ERR_invalid_bkey;
}
+#endif
break;
}
}
}
- bkey_fsck_err_on(!nr_ptrs, c, err,
- extent_ptrs_no_ptrs,
+ bkey_fsck_err_on(!nr_ptrs,
+ c, extent_ptrs_no_ptrs,
"no ptrs");
- bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
- extent_ptrs_too_many_ptrs,
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
+ c, extent_ptrs_too_many_ptrs,
"too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
- bkey_fsck_err_on(have_written && have_unwritten, c, err,
- extent_ptrs_written_and_unwritten,
+ bkey_fsck_err_on(have_written && have_unwritten,
+ c, extent_ptrs_written_and_unwritten,
"extent with unwritten and written ptrs");
- bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
- extent_ptrs_unwritten,
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
+ c, extent_ptrs_unwritten,
"has unwritten ptrs");
- bkey_fsck_err_on(crc_since_last_ptr, c, err,
- extent_ptrs_redundant_crc,
+ bkey_fsck_err_on(crc_since_last_ptr,
+ c, extent_ptrs_redundant_crc,
"redundant crc entry");
- bkey_fsck_err_on(have_ec, c, err,
- extent_ptrs_redundant_stripe,
+ bkey_fsck_err_on(have_ec,
+ c, extent_ptrs_redundant_stripe,
"redundant stripe entry");
fsck_err:
return ret;
@@ -1377,6 +1394,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
return r != NULL;
}
+static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 sectors = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+ p.ptr.unwritten) {
+ sectors = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ sectors += p.crc.compressed_size;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
+ sectors += p.crc.compressed_size;
+ }
+
+ return sectors;
+}
+
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0;
+}
+
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
struct bch_io_opts *opts)
{
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index facdb8a86eec..42a7c6d820a0 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -409,26 +409,26 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
int, struct bkey_s);
#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \
- .key_invalid = bch2_btree_ptr_invalid, \
+ .key_validate = bch2_btree_ptr_validate, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
.trigger = bch2_trigger_extent, \
})
#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
- .key_invalid = bch2_btree_ptr_v2_invalid, \
+ .key_validate = bch2_btree_ptr_v2_validate, \
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
@@ -441,7 +441,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_extent ((struct bkey_ops) { \
- .key_invalid = bch2_bkey_ptrs_invalid, \
+ .key_validate = bch2_bkey_ptrs_validate, \
.val_to_text = bch2_bkey_ptrs_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
@@ -451,13 +451,13 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
#define bch2_bkey_ops_reservation ((struct bkey_ops) { \
- .key_invalid = bch2_reservation_invalid, \
+ .key_validate = bch2_reservation_validate, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
.trigger = bch2_trigger_reservation, \
@@ -649,26 +649,21 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
- struct bch_extent_ptr *);
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
- struct bch_extent_ptr *);
+void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
+void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
do { \
- struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
\
- struct bch_extent_ptr *_ptr = &_ptrs.start->ptr; \
- \
- while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
+ bkey_for_each_ptr(_ptrs, _ptr) \
if (_cond) { \
- _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \
- _ptrs = bch2_bkey_ptrs(_k); \
- continue; \
+ bch2_bkey_drop_ptr(_k, _ptr); \
+ goto _again; \
} \
- \
- (_ptr)++; \
- } \
} while (0)
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
@@ -683,8 +678,8 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_ptr_swab(struct bkey_s);
@@ -692,6 +687,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
unsigned, unsigned);
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
struct bch_io_opts *);
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index cc33d763f722..ff60c041abe5 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -534,7 +534,7 @@ do_io:
if (f_sectors > w->tmp_sectors) {
kfree(w->tmp);
- w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
+ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL);
w->tmp_sectors = f_sectors;
}
@@ -659,7 +659,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
int bch2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -728,12 +728,11 @@ out:
goto err;
}
- *pagep = &folio->page;
+ *foliop = folio;
return 0;
err:
folio_unlock(folio);
folio_put(folio);
- *pagep = NULL;
err_unlock:
bch2_pagecache_add_put(inode);
kfree(res);
@@ -743,12 +742,11 @@ err_unlock:
int bch2_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation *res = fsdata;
- struct folio *folio = page_folio(page);
unsigned offset = pos - folio_pos(folio);
lockdep_assert_held(&inode->v.i_rwsem);
@@ -802,8 +800,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
- loff_t pos, unsigned len,
- bool inode_locked)
+ loff_t pos, unsigned len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@@ -827,15 +824,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
- /*
- * If we're not using the inode lock, we need to lock all the folios for
- * atomiticity of writes vs. other writes:
- */
- if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
- ret = -BCH_ERR_need_inode_lock;
- goto out;
- }
-
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@@ -932,10 +920,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size) {
- BUG_ON(!inode_locked);
+ if (end > inode->v.i_size)
i_size_write(&inode->v, end);
- }
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@@ -979,68 +965,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos;
- bool inode_locked = false;
- ssize_t written = 0, written2 = 0, ret = 0;
-
- /*
- * We don't take the inode lock unless i_size will be changing. Folio
- * locks provide exclusion with other writes, and the pagecache add lock
- * provides exclusion with truncate and hole punching.
- *
- * There is one nasty corner case where atomicity would be broken
- * without great care: when copying data from userspace to the page
- * cache, we do that with faults disable - a page fault would recurse
- * back into the filesystem, taking filesystem locks again, and
- * deadlock; so it's done with faults disabled, and we fault in the user
- * buffer when we aren't holding locks.
- *
- * If we do part of the write, but we then race and in the userspace
- * buffer have been evicted and are no longer resident, then we have to
- * drop our folio locks to re-fault them in, breaking write atomicity.
- *
- * To fix this, we restart the write from the start, if we weren't
- * holding the inode lock.
- *
- * There is another wrinkle after that; if we restart the write from the
- * start, and then get an unrecoverable error, we _cannot_ claim to
- * userspace that we did not write data we actually did - so we must
- * track (written2) the most we ever wrote.
- */
-
- if ((iocb->ki_flags & IOCB_APPEND) ||
- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
- inode_lock(&inode->v);
- inode_locked = true;
- }
-
- ret = generic_write_checks(iocb, iter);
- if (ret <= 0)
- goto unlock;
-
- ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
- if (ret) {
- if (!inode_locked) {
- inode_lock(&inode->v);
- inode_locked = true;
- ret = file_remove_privs_flags(file, 0);
- }
- if (ret)
- goto unlock;
- }
-
- ret = file_update_time(file);
- if (ret)
- goto unlock;
-
- pos = iocb->ki_pos;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ int ret = 0;
bch2_pagecache_add_get(inode);
- if (!inode_locked &&
- (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
- goto get_inode_lock;
-
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@@ -1065,17 +995,12 @@ again:
}
}
- if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
- goto get_inode_lock;
-
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
- if (ret == -BCH_ERR_need_inode_lock)
- goto get_inode_lock;
+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
if (unlikely(ret < 0))
break;
@@ -1096,46 +1021,50 @@ again:
}
pos += ret;
written += ret;
- written2 = max(written, written2);
-
- if (ret != bytes && !inode_locked)
- goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
-
- if (0) {
-get_inode_lock:
- bch2_pagecache_add_put(inode);
- inode_lock(&inode->v);
- inode_locked = true;
- bch2_pagecache_add_get(inode);
-
- iov_iter_revert(iter, written);
- pos -= written;
- written = 0;
- ret = 0;
- }
} while (iov_iter_count(iter));
- bch2_pagecache_add_put(inode);
-unlock:
- if (inode_locked)
- inode_unlock(&inode->v);
- iocb->ki_pos += written;
+ bch2_pagecache_add_put(inode);
- ret = max(written, written2) ?: ret;
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
- return ret;
+ return written ? written : ret;
}
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- ssize_t ret = iocb->ki_flags & IOCB_DIRECT
- ? bch2_direct_write(iocb, iter)
- : bch2_buffered_write(iocb, iter);
+ struct file *file = iocb->ki_filp;
+ struct bch_inode_info *inode = file_bch_inode(file);
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = bch2_direct_write(iocb, from);
+ goto out;
+ }
+
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+
+ ret = file_remove_privs(file);
+ if (ret)
+ goto unlock;
+
+ ret = file_update_time(file);
+ if (ret)
+ goto unlock;
+
+ ret = bch2_buffered_write(iocb, from);
+ if (likely(ret > 0))
+ iocb->ki_pos += ret;
+unlock:
+ inode_unlock(&inode->v);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+out:
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
index a6126ff790e6..3207ebbb4ab4 100644
--- a/fs/bcachefs/fs-io-buffered.h
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -10,10 +10,10 @@ int bch2_read_folio(struct file *, struct folio *);
int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
- unsigned, struct page **, void **);
+int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
+ unsigned len, struct folio **, void **);
int bch2_write_end(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page *, void *);
+ unsigned len, unsigned copied, struct folio *, void *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index aea8132d2c40..99c7fe987c74 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c,
mutex_lock(&c->sb_lock);
strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
- mutex_unlock(&c->sb_lock);
-
ret = bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
mnt_drop_write_file(file);
return ret;
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 3a5f49affa0a..011817afc3ad 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -177,6 +177,14 @@ static unsigned bch2_inode_hash(subvol_inum inum)
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
+struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+{
+ return to_bch_ei(ilookup5_nowait(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ &inum));
+}
+
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
subvol_inum inum = inode_inum(inode);
@@ -193,7 +201,7 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
* only insert fully created inodes in the inode hash table. But
* discard_new_inode() expects it to be set...
*/
- inode->v.i_flags |= I_NEW;
+ inode->v.i_state |= I_NEW;
/*
* We don't want bch2_evict_inode() to delete the inode on disk,
* we just raced and had another inode in cache. Normally new
@@ -1199,7 +1207,7 @@ static const struct inode_operations bch_file_inode_operations = {
.fiemap = bch2_fiemap,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1219,7 +1227,7 @@ static const struct inode_operations bch_dir_inode_operations = {
.tmpfile = bch2_tmpfile,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1241,7 +1249,7 @@ static const struct inode_operations bch_symlink_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1251,7 +1259,7 @@ static const struct inode_operations bch_special_inode_operations = {
.setattr = bch2_setattr,
.listxattr = bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
- .get_acl = bch2_get_acl,
+ .get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
};
@@ -1644,14 +1652,16 @@ again:
break;
}
} else if (clean_pass && this_pass_clean) {
- wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
- DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+ prepare_to_wait_event(wq_head, &wqe.wq_entry,
+ TASK_UNINTERRUPTIBLE);
mutex_unlock(&c->vfs_inodes_lock);
schedule();
- finish_wait(wq, &wait.wq_entry);
+ finish_wait(wq_head, &wqe.wq_entry);
goto again;
}
}
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index c3af7225ff69..990ec43e0365 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -56,6 +56,8 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
};
}
+struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
+
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
@@ -194,6 +196,11 @@ int bch2_vfs_init(void);
#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
+static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+{
+ return NULL;
+}
+
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
static inline void bch2_vfs_exit(void) {}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 9138944c5ae6..9b3470a97546 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -8,6 +8,7 @@
#include "darray.h"
#include "dirent.h"
#include "error.h"
+#include "fs.h"
#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
@@ -962,6 +963,22 @@ fsck_err:
return ret;
}
+static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
+{
+ subvol_inum inum = {
+ .subvol = snapshot_t(c, p.snapshot)->subvol,
+ .inum = p.offset,
+ };
+
+ /* snapshot tree corruption, can't safely delete */
+ if (!inum.subvol) {
+ bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
+ return true;
+ }
+
+ return __bch2_inode_hash_find(c, inum) != NULL;
+}
+
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1040,6 +1057,7 @@ static int check_inode(struct btree_trans *trans,
}
if (u.bi_flags & BCH_INODE_unlinked &&
+ !bch2_inode_open(c, k.k->p) &&
(!c->sb.clean ||
fsck_err(trans, inode_unlinked_but_clean,
"filesystem marked clean, but inode %llu unlinked",
@@ -2006,7 +2024,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
ret = -BCH_ERR_fsck_repair_unimplemented;
- ret = 0;
goto err;
}
@@ -2216,6 +2233,8 @@ int bch2_check_xattrs(struct bch_fs *c)
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
+
+ inode_walker_exit(&inode);
bch_err_fn(c, ret);
return ret;
}
@@ -2469,8 +2488,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
: bch2_inode_unpack(inode_k, &inode);
if (ret) {
/* Should have been caught in dirents pass */
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "error looking up parent directory: %i", ret);
+ bch_err_msg(c, ret, "error looking up parent directory");
break;
}
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 1e20020eadd1..2be6be33afa3 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -434,100 +434,98 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
return &inode_p->inode.k_i;
}
-static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
+static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bch_inode_unpacked unpacked;
int ret = 0;
- bkey_fsck_err_on(k.k->p.inode, c, err,
- inode_pos_inode_nonzero,
+ bkey_fsck_err_on(k.k->p.inode,
+ c, inode_pos_inode_nonzero,
"nonzero k.p.inode");
- bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
- inode_pos_blockdev_range,
+ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
+ c, inode_pos_blockdev_range,
"fs inode in blockdev range");
- bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
- inode_unpack_error,
+ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
+ c, inode_unpack_error,
"invalid variable length fields");
- bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
- inode_checksum_type_invalid,
+ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
+ c, inode_checksum_type_invalid,
"invalid data checksum type (%u >= %u",
unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
bkey_fsck_err_on(unpacked.bi_compression &&
- !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
- inode_compression_type_invalid,
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1),
+ c, inode_compression_type_invalid,
"invalid compression opt %u", unpacked.bi_compression - 1);
bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
- unpacked.bi_nlink != 0, c, err,
- inode_unlinked_but_nlink_nonzero,
+ unpacked.bi_nlink != 0,
+ c, inode_unlinked_but_nlink_nonzero,
"flagged as unlinked but bi_nlink != 0");
- bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
- inode_subvol_root_but_not_dir,
+ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
+ c, inode_subvol_root_but_not_dir,
"subvolume root but not a directory");
fsck_err:
return ret;
}
-int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
int ret = 0;
- bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
- inode_str_hash_invalid,
+ bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_invalid(c, k, err);
+ ret = __bch2_inode_validate(c, k, flags);
fsck_err:
return ret;
}
-int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
int ret = 0;
- bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
- inode_str_hash_invalid,
+ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_invalid(c, k, err);
+ ret = __bch2_inode_validate(c, k, flags);
fsck_err:
return ret;
}
-int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
int ret = 0;
bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
- INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
- inode_v3_fields_start_bad,
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
+ c, inode_v3_fields_start_bad,
"invalid fields_start (got %llu, min %u max %zu)",
INODEv3_FIELDS_START(inode.v),
INODEv3_FIELDS_START_INITIAL,
bkey_val_u64s(inode.k));
- bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
- inode_str_hash_invalid,
+ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
- ret = __bch2_inode_invalid(c, k, err);
+ ret = __bch2_inode_validate(c, k, flags);
fsck_err:
return ret;
}
@@ -625,14 +623,13 @@ int bch2_trigger_inode(struct btree_trans *trans,
return 0;
}
-int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
int ret = 0;
- bkey_fsck_err_on(k.k->p.inode, c, err,
- inode_pos_inode_nonzero,
+ bkey_fsck_err_on(k.k->p.inode,
+ c, inode_pos_inode_nonzero,
"nonzero k.p.inode");
fsck_err:
return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index da0e4a745099..f1fcb4c58039 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -9,12 +9,12 @@
enum bch_validate_flags;
extern const char * const bch2_inode_opts[];
-int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_inode_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
+int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
+int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
@@ -22,21 +22,21 @@ int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_inode ((struct bkey_ops) { \
- .key_invalid = bch2_inode_invalid, \
+ .key_validate = bch2_inode_validate, \
.val_to_text = bch2_inode_to_text, \
.trigger = bch2_trigger_inode, \
.min_val_size = 16, \
})
#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
- .key_invalid = bch2_inode_v2_invalid, \
+ .key_validate = bch2_inode_v2_validate, \
.val_to_text = bch2_inode_to_text, \
.trigger = bch2_trigger_inode, \
.min_val_size = 32, \
})
#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
- .key_invalid = bch2_inode_v3_invalid, \
+ .key_validate = bch2_inode_v3_validate, \
.val_to_text = bch2_inode_to_text, \
.trigger = bch2_trigger_inode, \
.min_val_size = 48, \
@@ -49,12 +49,12 @@ static inline bool bkey_is_inode(const struct bkey *k)
k->type == KEY_TYPE_inode_v3;
}
-int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \
- .key_invalid = bch2_inode_generation_invalid, \
+ .key_validate = bch2_inode_generation_validate, \
.val_to_text = bch2_inode_generation_to_text, \
.min_val_size = 8, \
})
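For context on the pattern repeated across this series: the per-key-type hook changes from .key_invalid to .key_validate and drops the struct printbuf * argument; the error text is now produced inside bkey_fsck_err_on(), which takes the condition, the filesystem, the superblock error id, and the message format. A minimal sketch of the new shape, modelled on the bch2_lru_validate() hunk later in this diff (the "widget" type and the widget_pos_bad error id are hypothetical names, used only for illustration):

/* old hook shape, removed by this series */
int (*key_invalid)(struct bch_fs *, struct bkey_s_c,
		   enum bch_validate_flags, struct printbuf *);

/* new hook shape */
int (*key_validate)(struct bch_fs *, struct bkey_s_c,
		    enum bch_validate_flags);

/* hypothetical key type following the new convention */
int bch2_widget_validate(struct bch_fs *c, struct bkey_s_c k,
			 enum bch_validate_flags flags)
{
	int ret = 0;

	/* condition, fs, sb error id, then the message format: */
	bkey_fsck_err_on(!k.k->p.offset,
			 c, widget_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}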
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 2cf6297756f8..177ed331c00b 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -126,11 +126,7 @@ err_noprint:
if (closure_nr_remaining(&cl) != 1) {
bch2_trans_unlock_long(trans);
-
- if (closure_sync_timeout(&cl, HZ * 10)) {
- bch2_print_allocator_stuck(c);
- closure_sync(&cl);
- }
+ bch2_wait_on_allocator(c, &cl);
}
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 4531c9ab3e12..7ee3b75480df 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -406,6 +406,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
bch2_trans_iter_init(trans, &iter, rbio->data_btree,
rbio->read_pos, BTREE_ITER_slots);
retry:
+ bch2_trans_begin(trans);
rbio->bio.bi_status = 0;
k = bch2_btree_iter_peek_slot(&iter);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index d31c8d006d97..1d4761d15002 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1503,10 +1503,7 @@ err:
if ((op->flags & BCH_WRITE_SYNC) ||
(!(op->flags & BCH_WRITE_SUBMITTED) &&
!(op->flags & BCH_WRITE_IN_WORKER))) {
- if (closure_sync_timeout(&op->cl, HZ * 10)) {
- bch2_print_allocator_stuck(c);
- closure_sync(&op->cl);
- }
+ bch2_wait_on_allocator(c, &op->cl);
__bch2_write_index(op);
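This hunk and the io_misc.c hunk above both replace the open-coded ten-second closure_sync_timeout()/bch2_print_allocator_stuck() sequence with bch2_wait_on_allocator(), while the new allocator_stuck_timeout option added to opts.h below makes the warning threshold configurable (default 30 seconds, also backfilled in super-io.c for older superblocks). The helper's body is not part of this excerpt; a plausible sketch under that assumption, simply re-wrapping the removed pattern around the option (the c->opts.allocator_stuck_timeout access and the treat-0-as-default handling are guesses, not the series' actual implementation):

/* sketch only: wait for an allocation, warning if it appears stuck */
static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
{
	unsigned long timeout = (c->opts.allocator_stuck_timeout ?: 30) * HZ;

	if (closure_sync_timeout(cl, timeout)) {
		/* timed out: report the stuck allocator, then wait unbounded */
		bch2_print_allocator_stuck(c);
		closure_sync(cl);
	}
}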
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 649e3a01608a..f5f7db50ca31 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
}
if (!had_entries)
- j->last_empty_seq = cur_seq;
+ j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 7a833a3f1c63..7664b68e6a15 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -332,7 +332,6 @@ static int journal_validate_key(struct bch_fs *c,
{
int write = flags & BCH_VALIDATE_write;
void *next = vstruct_next(entry);
- struct printbuf buf = PRINTBUF;
int ret = 0;
if (journal_entry_err_on(!k->k.u64s,
@@ -368,34 +367,21 @@ static int journal_validate_key(struct bch_fs *c,
bch2_bkey_compat(level, btree_id, version, big_endian,
write, NULL, bkey_to_packed(k));
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
- __btree_node_type(level, btree_id), write, &buf)) {
- printbuf_reset(&buf);
- journal_entry_err_msg(&buf, version, jset, entry);
- prt_newline(&buf);
- printbuf_indent_add(&buf, 2);
-
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- prt_newline(&buf);
- bch2_bkey_invalid(c, bkey_i_to_s_c(k),
- __btree_node_type(level, btree_id), write, &buf);
-
- mustfix_fsck_err(c, journal_entry_bkey_invalid,
- "%s", buf.buf);
-
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id), write);
+ if (ret == -BCH_ERR_fsck_delete_bkey) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
-
- printbuf_exit(&buf);
return FSCK_DELETED_KEY;
}
+ if (ret)
+ goto fsck_err;
if (write)
bch2_bkey_compat(level, btree_id, version, big_endian,
write, NULL, bkey_to_packed(k));
fsck_err:
- printbuf_exit(&buf);
return ret;
}
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index db80e506e3ab..62b910f2fb27 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
int ret = -BCH_ERR_invalid_sb_journal;
+ u64 sum = 0;
unsigned nr;
unsigned i;
struct u64_range *b;
@@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
for (i = 0; i < nr; i++) {
b[i].start = le64_to_cpu(journal->d[i].start);
b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+
+ if (b[i].end <= b[i].start) {
+ prt_printf(err, "journal buckets entry with bad nr: %llu+%llu",
+ le64_to_cpu(journal->d[i].start),
+ le64_to_cpu(journal->d[i].nr));
+ goto err;
+ }
+
+ sum += le64_to_cpu(journal->d[i].nr);
}
sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
@@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f
}
}
+ if (sum > UINT_MAX) {
+ prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX);
+ goto err;
+ }
+
ret = 0;
err:
kfree(b);
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 83b1586cb371..96f2f4f8c397 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -10,14 +10,13 @@
#include "recovery.h"
/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
int ret = 0;
- bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
- lru_entry_at_time_0,
+ bkey_fsck_err_on(!lru_pos_time(k.k->p),
+ c, lru_entry_at_time_0,
"lru entry at time=0");
fsck_err:
return ret;
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index 5bd8974a7f11..e6a7d8241bb8 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -33,14 +33,13 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
return BCH_LRU_read;
}
-int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
#define bch2_bkey_ops_lru ((struct bkey_ops) { \
- .key_invalid = bch2_lru_invalid, \
+ .key_validate = bch2_lru_validate, \
.val_to_text = bch2_lru_to_text, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index deef4f024d20..d86565bf07c8 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
- bch2_trans_unlock_long(ctxt.trans);
+ move_buckets_wait(&ctxt, buckets, true);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 60b93018501f..cda1725702ea 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -391,6 +391,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
NULL, "Log transaction function names in journal") \
+ x(allocator_stuck_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U16_MAX), \
+ BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \
+ NULL, "Default timeout in seconds for stuck allocator messages")\
x(noexcl, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index a0cca8b70e0a..c32a05e252e2 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -59,13 +59,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
.to_text = bch2_sb_quota_to_text,
};
-int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err)
+int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
int ret = 0;
- bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
- quota_type_invalid,
+ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR,
+ c, quota_type_invalid,
"invalid quota type (%llu >= %u)",
k.k->p.inode, QTYP_NR);
fsck_err:
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
index 02d37a332218..a62abcc5332a 100644
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
@@ -8,12 +8,11 @@
enum bch_validate_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_quota ((struct bkey_ops) { \
- .key_invalid = bch2_quota_invalid, \
+ .key_validate = bch2_quota_validate, \
.val_to_text = bch2_quota_to_text, \
.min_val_size = 32, \
})
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index d89eb43c5ce9..36de1c6fe8c3 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
const struct journal_key *l = *((const struct journal_key **)_l);
const struct journal_key *r = *((const struct journal_key **)_r);
- return cmp_int(l->journal_seq, r->journal_seq);
+ /*
+ * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last
+ *
+ * journal_seq == 0 means that the key comes from early repair, and
+ * should be inserted last so as to avoid overflowing the journal
+ */
+ return cmp_int(l->journal_seq - 1, r->journal_seq - 1);
}
int bch2_journal_replay(struct bch_fs *c)
@@ -322,6 +328,7 @@ int bch2_journal_replay(struct bch_fs *c)
}
}
+ bch2_trans_unlock_long(trans);
/*
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
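The journal_sort_seq_cmp() change above leans on unsigned wraparound: subtracting 1 from a u64 maps journal_seq == 0 (keys injected by early repair) to U64_MAX, so those keys sort after every real journal sequence number while the relative order of nonzero sequence numbers is unchanged. A small self-contained illustration of the same trick (plain userspace C, not kernel code; the kernel's cmp_int() is replaced by an explicit three-way compare):

#include <stdint.h>
#include <stdio.h>

/* same idea as cmp_int(l->journal_seq - 1, r->journal_seq - 1) */
static int seq_cmp(uint64_t l, uint64_t r)
{
	uint64_t a = l - 1, b = r - 1;	/* 0 wraps around to UINT64_MAX */

	return a < b ? -1 : a > b ? 1 : 0;
}

int main(void)
{
	printf("%d\n", seq_cmp(0, 5));	/* prints 1: seq 0 sorts after seq 5 */
	printf("%d\n", seq_cmp(3, 5));	/* prints -1: normal ordering preserved */
	return 0;
}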
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 5f92715e1525..e59c0abb4772 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -29,15 +29,14 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
-int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
int ret = 0;
bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad),
- c, err, reflink_p_front_pad_bad,
+ c, reflink_p_front_pad_bad,
"idx < front_pad (%llu < %u)",
le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
fsck_err:
@@ -256,11 +255,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans,
/* indirect extents */
-int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
+ return bch2_bkey_ptrs_validate(c, k, flags);
}
void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
@@ -311,9 +309,8 @@ int bch2_trigger_reflink_v(struct btree_trans *trans,
/* indirect inline data */
-int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
return 0;
}
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index e894f3a2c67a..51afe11d8ed6 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -4,41 +4,37 @@
enum bch_validate_flags;
-int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
- .key_invalid = bch2_reflink_p_invalid, \
+ .key_validate = bch2_reflink_p_validate, \
.val_to_text = bch2_reflink_p_to_text, \
.key_merge = bch2_reflink_p_merge, \
.trigger = bch2_trigger_reflink_p, \
.min_val_size = 16, \
})
-int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
-void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
- struct bkey_s_c);
+int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
- .key_invalid = bch2_reflink_v_invalid, \
+ .key_validate = bch2_reflink_v_validate, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
.trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
-int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
int bch2_trigger_indirect_inline_data(struct btree_trans *,
@@ -47,7 +43,7 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
- .key_invalid = bch2_indirect_inline_data_invalid, \
+ .key_validate = bch2_indirect_inline_data_validate, \
.val_to_text = bch2_indirect_inline_data_to_text, \
.trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 10c96cb2047a..1f34c92a6d11 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -24,7 +24,6 @@ static int bch2_memcmp(const void *l, const void *r, const void *priv)
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- BUG_ON(e->data_type >= BCH_DATA_NR);
BUG_ON(!e->nr_devs);
BUG_ON(e->nr_required > 1 &&
e->nr_required >= e->nr_devs);
@@ -83,7 +82,8 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
}
for (unsigned i = 0; i < r->nr_devs; i++)
- if (!bch2_member_exists(sb, r->devs[i])) {
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_member_exists(sb, r->devs[i])) {
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
goto bad;
}
@@ -452,7 +452,8 @@ retry:
.type = BCH_DISK_ACCOUNTING_replicas,
};
- memcpy(&k.replicas, e, replicas_entry_bytes(e));
+ unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
+ "embedded variable length struct");
struct bpos p = disk_accounting_pos_to_bpos(&k);
@@ -795,7 +796,7 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
nr_online += test_bit(e->devs[i], devs.d);
struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
- nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
+ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
rcu_read_unlock();
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index dfbbd33c8731..c7e4cdd3f6a5 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -61,6 +61,23 @@
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(disk_accounting_v3, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_bkey_version_in_future, \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_accounting_mismatch, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
+ BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
+ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
+ BCH_FSCK_ERR_accounting_key_junk_at_end) \
+ x(disk_accounting_inum, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch)
#define DOWNGRADE_TABLE() \
@@ -79,7 +96,25 @@
BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
- BCH_FSCK_ERR_bkey_version_in_future)
+ BCH_FSCK_ERR_bkey_version_in_future) \
+ x(disk_accounting_v3, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_fs_usage_hidden_wrong, \
+ BCH_FSCK_ERR_fs_usage_btree_wrong, \
+ BCH_FSCK_ERR_fs_usage_data_wrong, \
+ BCH_FSCK_ERR_fs_usage_cached_wrong, \
+ BCH_FSCK_ERR_fs_usage_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
+ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_replicas_wrong, \
+ BCH_FSCK_ERR_accounting_replicas_not_marked, \
+ BCH_FSCK_ERR_bkey_version_in_future) \
+ x(rebalance_work_acct_fix, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_accounting_mismatch)
struct upgrade_downgrade_entry {
u64 recovery_passes;
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index d1b2f2aa397a..f0c14702f9e6 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -23,7 +23,7 @@ enum bch_fsck_flags {
x(jset_past_bucket_end, 9, 0) \
x(jset_seq_blacklisted, 10, 0) \
x(journal_entries_missing, 11, 0) \
- x(journal_entry_replicas_not_marked, 12, 0) \
+ x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
x(journal_entry_past_jset_end, 13, 0) \
x(journal_entry_replicas_data_mismatch, 14, 0) \
x(journal_entry_bkey_u64s_0, 15, 0) \
@@ -287,7 +287,11 @@ enum bch_fsck_flags {
x(accounting_replicas_not_marked, 273, 0) \
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
- x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX)
+ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
+ x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
+ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 39196f2a4197..4b765422dd77 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -11,7 +11,8 @@
void bch2_dev_missing(struct bch_fs *c, unsigned dev)
{
- bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
+ if (dev != BCH_SB_MEMBER_INVALID)
+ bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
}
void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
index e2630548c0f6..d727d2dfda08 100644
--- a/fs/bcachefs/sb-members_format.h
+++ b/fs/bcachefs/sb-members_format.h
@@ -8,6 +8,11 @@
*/
#define BCH_SB_MEMBERS_MAX 64
+/*
+ * Sentinel value - indicates a device that does not exist
+ */
+#define BCH_SB_MEMBER_INVALID 255
+
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 96744b1a76f5..8b18a9b483a4 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -31,15 +31,14 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(t.v->root_snapshot));
}
-int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
int ret = 0;
bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)), c, err,
- snapshot_tree_pos_bad,
+ bkey_lt(k.k->p, POS(0, 1)),
+ c, snapshot_tree_pos_bad,
"bad pos");
fsck_err:
return ret;
@@ -225,55 +224,54 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(s.v->skip[2]));
}
-int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_snapshot s;
u32 i, id;
int ret = 0;
bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1)), c, err,
- snapshot_pos_bad,
+ bkey_lt(k.k->p, POS(0, 1)),
+ c, snapshot_pos_bad,
"bad pos");
s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
- bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
- snapshot_parent_bad,
+ bkey_fsck_err_on(id && id <= k.k->p.offset,
+ c, snapshot_parent_bad,
"bad parent node (%u <= %llu)",
id, k.k->p.offset);
- bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
- snapshot_children_not_normalized,
+ bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
+ c, snapshot_children_not_normalized,
"children not normalized");
- bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
- snapshot_child_duplicate,
+ bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
+ c, snapshot_child_duplicate,
"duplicate child nodes");
for (i = 0; i < 2; i++) {
id = le32_to_cpu(s.v->children[i]);
- bkey_fsck_err_on(id >= k.k->p.offset, c, err,
- snapshot_child_bad,
+ bkey_fsck_err_on(id >= k.k->p.offset,
+ c, snapshot_child_bad,
"bad child node (%u >= %llu)",
id, k.k->p.offset);
}
if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
- le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
- snapshot_skiplist_not_normalized,
+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
+ c, snapshot_skiplist_not_normalized,
"skiplist not normalized");
for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
id = le32_to_cpu(s.v->skip[i]);
- bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
- snapshot_skiplist_bad,
+ bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
+ c, snapshot_skiplist_bad,
"bad skiplist node %u", id);
}
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 31b0ee03e962..eb5ef64221d6 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -5,11 +5,11 @@
enum bch_validate_flags;
void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags);
#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
- .key_invalid = bch2_snapshot_tree_invalid, \
+ .key_validate = bch2_snapshot_tree_validate, \
.val_to_text = bch2_snapshot_tree_to_text, \
.min_val_size = 8, \
})
@@ -19,14 +19,13 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
- .key_invalid = bch2_snapshot_invalid, \
+ .key_validate = bch2_snapshot_validate, \
.val_to_text = bch2_snapshot_to_text, \
.trigger = bch2_mark_snapshot, \
.min_val_size = 24, \
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index f56720b55862..dbe834cb349f 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -207,23 +207,23 @@ int bch2_check_subvol_children(struct bch_fs *c)
/* Subvolumes: */
-int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags, struct printbuf *err)
+int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k);
int ret = 0;
bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
- bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
- subvol_pos_bad,
+ bkey_gt(k.k->p, SUBVOL_POS_MAX),
+ c, subvol_pos_bad,
"invalid pos");
- bkey_fsck_err_on(!subvol.v->snapshot, c, err,
- subvol_snapshot_bad,
+ bkey_fsck_err_on(!subvol.v->snapshot,
+ c, subvol_snapshot_bad,
"invalid snapshot");
- bkey_fsck_err_on(!subvol.v->inode, c, err,
- subvol_inode_bad,
+ bkey_fsck_err_on(!subvol.v->inode,
+ c, subvol_inode_bad,
"invalid inode");
fsck_err:
return ret;
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index afa5e871efb2..a8299ba2cab2 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -10,15 +10,14 @@ enum bch_validate_flags;
int bch2_check_subvols(struct bch_fs *);
int bch2_check_subvol_children(struct bch_fs *);
-int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
- .key_invalid = bch2_subvolume_invalid, \
+ .key_validate = bch2_subvolume_validate, \
.val_to_text = bch2_subvolume_to_text, \
.trigger = bch2_subvolume_trigger, \
.min_val_size = 16, \
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 8bc819832790..c8c2ccbdfbb5 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -414,6 +414,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb))
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version));
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
+ !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
+ SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
}
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 0455a1001fec..e7fa2de35014 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1193,7 +1193,6 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
- kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 1c0d1fb20276..33f2a64c14c9 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -367,7 +367,7 @@ SHOW(bch2_fs)
bch2_stripes_heap_to_text(out, c);
if (attr == &sysfs_open_buckets)
- bch2_open_buckets_to_text(out, c);
+ bch2_open_buckets_to_text(out, c, NULL);
if (attr == &sysfs_open_buckets_partial)
bch2_open_buckets_partial_to_text(out, c);
@@ -461,7 +461,7 @@ STORE(bch2_fs)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_key_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc);
}
if (attr == &sysfs_trigger_gc)
@@ -811,6 +811,9 @@ SHOW(bch2_dev)
if (attr == &sysfs_alloc_debug)
bch2_dev_alloc_debug_to_text(out, ca);
+ if (attr == &sysfs_open_buckets)
+ bch2_open_buckets_to_text(out, c, ca);
+
return 0;
}
@@ -892,6 +895,7 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
+ &sysfs_open_buckets,
NULL
};
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
index dc48b52b01b4..dfad1d06633d 100644
--- a/fs/bcachefs/trace.c
+++ b/fs/bcachefs/trace.c
@@ -4,6 +4,7 @@
#include "buckets.h"
#include "btree_cache.h"
#include "btree_iter.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index d0e6b9deb6cb..c62f00322d1e 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -988,10 +988,33 @@ TRACE_EVENT(trans_restart_split_race,
__entry->u64s_remaining)
);
-DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim,
+TRACE_EVENT(trans_blocked_journal_reclaim,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+
+ __field(unsigned long, key_cache_nr_keys )
+ __field(unsigned long, key_cache_nr_dirty )
+ __field(long, must_wait )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys);
+ __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty);
+ __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c);
+ ),
+
+ TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li",
+ __entry->trans_fn, (void *) __entry->caller_ip,
+ __entry->key_cache_nr_keys,
+ __entry->key_cache_nr_dirty,
+ __entry->must_wait)
);
TRACE_EVENT(trans_restart_journal_preres_get,
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 138320eaa2ad..1b8554460af4 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
printbuf_tabstop_push(out, TABSTOP_SIZE + 2);
prt_printf(out, "\tsince mount\r\trecent\r\n");
- prt_printf(out, "recent");
printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, out->indent + 20);
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index c11bf6dacc2c..331f944d73dc 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -70,17 +70,16 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
.cmp_bkey = xattr_cmp_bkey,
};
-int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
- enum bch_validate_flags flags,
- struct printbuf *err)
+int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags)
{
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
le16_to_cpu(xattr.v->x_val_len));
int ret = 0;
- bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err,
- xattr_val_size_too_small,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s,
+ c, xattr_val_size_too_small,
"value too small (%zu < %u)",
bkey_val_u64s(k.k), val_u64s);
@@ -88,17 +87,17 @@ int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
val_u64s = xattr_val_u64s(xattr.v->x_name_len,
le16_to_cpu(xattr.v->x_val_len) + 4);
- bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err,
- xattr_val_size_too_big,
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s,
+ c, xattr_val_size_too_big,
"value too big (%zu > %u)",
bkey_val_u64s(k.k), val_u64s);
- bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err,
- xattr_invalid_type,
+ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type),
+ c, xattr_invalid_type,
"invalid type (%u)", xattr.v->x_type);
- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err,
- xattr_name_invalid_chars,
+ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
+ c, xattr_name_invalid_chars,
"xattr name has invalid characters");
fsck_err:
return ret;
@@ -613,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective(
name, buffer, size, true);
}
+/* Noop - xattrs in the bcachefs_effective namespace are inherited */
+static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *dentry, struct inode *vinode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return 0;
+}
+
static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
.prefix = "bcachefs_effective.",
.get = bch2_xattr_bcachefs_get_effective,
- .set = bch2_xattr_bcachefs_set,
+ .set = bch2_xattr_bcachefs_set_effective,
};
#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 1574b9eb4c85..c188a5ad64ce 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -6,12 +6,11 @@
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
- enum bch_validate_flags, struct printbuf *);
+int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_xattr ((struct bkey_ops) { \
- .key_invalid = bch2_xattr_invalid, \
+ .key_validate = bch2_xattr_validate, \
.val_to_text = bch2_xattr_to_text, \
.min_val_size = 8, \
})
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index a778411574a9..fa66a09e496a 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -172,11 +172,11 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
static int bfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block);
+ ret = block_write_begin(mapping, pos, len, foliop, bfs_get_block);
if (unlikely(ret))
bfs_write_failed(mapping, pos + len);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 19fa49cd9907..34d0d1e43f36 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1314,6 +1314,11 @@ out_free_interp:
emulate the SVr4 behavior. Sigh. */
error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE, 0);
+
+ retval = do_mseal(0, PAGE_SIZE, 0);
+ if (retval)
+ pr_warn_ratelimited("pid=%d, couldn't seal address 0, ret=%d.\n",
+ task_pid_nr(current), retval);
}
regs = current_pt_regs();
@@ -2027,8 +2032,10 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm)) {
+ coredump_report_failure("Error collecting note info");
goto end_coredump;
+ }
has_dumped = 1;
@@ -2039,12 +2046,14 @@ static int elf_core_dump(struct coredump_params *cprm)
{
size_t sz = info.size;
- /* For cell spufs */
+ /* For cell spufs and x86 xstate */
sz += elf_coredump_extra_notes_size();
phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
- if (!phdr4note)
+ if (!phdr4note) {
+ coredump_report_failure("Error allocating program headers note entry");
goto end_coredump;
+ }
fill_elf_note_phdr(phdr4note, sz, offset);
offset += sz;
@@ -2058,18 +2067,24 @@ static int elf_core_dump(struct coredump_params *cprm)
if (e_phnum == PN_XNUM) {
shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
- if (!shdr4extnum)
+ if (!shdr4extnum) {
+ coredump_report_failure("Error allocating extra program headers");
goto end_coredump;
+ }
fill_extnum_info(&elf, shdr4extnum, e_shoff, segs);
}
offset = dataoff;
- if (!dump_emit(cprm, &elf, sizeof(elf)))
+ if (!dump_emit(cprm, &elf, sizeof(elf))) {
+ coredump_report_failure("Error emitting the ELF headers");
goto end_coredump;
+ }
- if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
+ if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) {
+ coredump_report_failure("Error emitting the program header for notes");
goto end_coredump;
+ }
/* Write program headers for segments dump */
for (i = 0; i < cprm->vma_count; i++) {
@@ -2092,20 +2107,28 @@ static int elf_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- if (!dump_emit(cprm, &phdr, sizeof(phdr)))
+ if (!dump_emit(cprm, &phdr, sizeof(phdr))) {
+ coredump_report_failure("Error emitting program headers");
goto end_coredump;
+ }
}
- if (!elf_core_write_extra_phdrs(cprm, offset))
+ if (!elf_core_write_extra_phdrs(cprm, offset)) {
+ coredump_report_failure("Error writing out extra program headers");
goto end_coredump;
+ }
/* write out the notes section */
- if (!write_note_info(&info, cprm))
+ if (!write_note_info(&info, cprm)) {
+ coredump_report_failure("Error writing out notes");
goto end_coredump;
+ }
- /* For cell spufs */
- if (elf_coredump_extra_notes_write(cprm))
+ /* For cell spufs and x86 xstate */
+ if (elf_coredump_extra_notes_write(cprm)) {
+ coredump_report_failure("Error writing out extra notes");
goto end_coredump;
+ }
/* Align to page */
dump_skip_to(cprm, dataoff);
@@ -2113,16 +2136,22 @@ static int elf_core_dump(struct coredump_params *cprm)
for (i = 0; i < cprm->vma_count; i++) {
struct core_vma_metadata *meta = cprm->vma_meta + i;
- if (!dump_user_range(cprm, meta->start, meta->dump_size))
+ if (!dump_user_range(cprm, meta->start, meta->dump_size)) {
+ coredump_report_failure("Error writing out the process memory");
goto end_coredump;
+ }
}
- if (!elf_core_write_extra_data(cprm))
+ if (!elf_core_write_extra_data(cprm)) {
+ coredump_report_failure("Error writing out extra data");
goto end_coredump;
+ }
if (e_phnum == PN_XNUM) {
- if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
+ if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) {
+ coredump_report_failure("Error emitting extra program headers");
goto end_coredump;
+ }
}
end_coredump:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 28a3439f163a..4fe5bb9f1b1f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -589,6 +589,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
if (bprm->have_execfd)
nitems++;
+#ifdef ELF_HWCAP2
+ nitems++;
+#endif
csp = sp;
sp -= nitems * 2 * sizeof(unsigned long);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index c26545d71d39..cd6d5bbb4b9d 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -72,8 +72,10 @@
#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
#define DATA_START_OFFSET_WORDS (0)
+#define MAX_SHARED_LIBS_UPDATE (0)
#else
#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS)
+#define MAX_SHARED_LIBS_UPDATE (MAX_SHARED_LIBS)
#endif
struct lib_info {
@@ -880,7 +882,7 @@ static int load_flat_binary(struct linux_binprm *bprm)
return res;
/* Update data segment pointers for all libraries */
- for (i = 0; i < MAX_SHARED_LIBS; i++) {
+ for (i = 0; i < MAX_SHARED_LIBS_UPDATE; i++) {
if (!libinfo.lib_list[i].loaded)
continue;
for (j = 0; j < MAX_SHARED_LIBS; j++) {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a2de5c05f97c..e2f478ecd7fd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -219,8 +219,8 @@ static void free_pref(struct prelim_ref *ref)
* A -1 return indicates ref1 is a 'lower' block than ref2, while 1
* indicates a 'higher' block.
*/
-static int prelim_ref_compare(struct prelim_ref *ref1,
- struct prelim_ref *ref2)
+static int prelim_ref_compare(const struct prelim_ref *ref1,
+ const struct prelim_ref *ref2)
{
if (ref1->level < ref2->level)
return -1;
@@ -251,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
}
static void update_share_count(struct share_check *sc, int oldcount,
- int newcount, struct prelim_ref *newref)
+ int newcount, const struct prelim_ref *newref)
{
if ((!sc) || (oldcount == 0 && newcount < 1))
return;
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index f04d93109960..fec5c6cde0a7 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -53,7 +53,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
/*
* Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
- * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
*
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool.
@@ -73,20 +73,13 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
struct btrfs_bio *orig_bbio,
- u64 map_length, bool use_append)
+ u64 map_length)
{
struct btrfs_bio *bbio;
struct bio *bio;
- if (use_append) {
- unsigned int nr_segs;
-
- bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
- &btrfs_clone_bioset, map_length);
- } else {
- bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
- GFP_NOFS, &btrfs_clone_bioset);
- }
+ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
+ &btrfs_clone_bioset);
bbio = btrfs_bio(bio);
btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
bbio->inode = orig_bbio->inode;
@@ -120,12 +113,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
}
}
-void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
-{
- bbio->bio.bi_status = status;
- __btrfs_bio_end_io(bbio);
-}
-
static void btrfs_orig_write_end_io(struct bio *bio);
static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
@@ -147,8 +134,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
}
}
-static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
+void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
+ bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
@@ -179,7 +167,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
if (atomic_dec_and_test(&fbio->repair_count)) {
- btrfs_orig_bbio_end_io(fbio->bbio);
+ btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
mempool_free(fbio, &btrfs_failed_bio_pool);
}
}
@@ -211,7 +199,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
goto done;
}
- btrfs_submit_bio(repair_bbio, mirror);
+ btrfs_submit_bbio(repair_bbio, mirror);
return;
}
@@ -280,7 +268,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
- btrfs_submit_bio(repair_bbio, mirror);
+ btrfs_submit_bbio(repair_bbio, mirror);
return fbio;
}
@@ -326,7 +314,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
if (fbio)
btrfs_repair_done(fbio);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
@@ -360,7 +348,7 @@ static void btrfs_end_bio_work(struct work_struct *work)
if (is_data_bbio(bbio))
btrfs_check_read_bio(bbio, bbio->bio.bi_private);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
static void btrfs_simple_end_io(struct bio *bio)
@@ -380,7 +368,7 @@ static void btrfs_simple_end_io(struct bio *bio)
} else {
if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
btrfs_record_physical_zoned(bbio);
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
}
@@ -394,7 +382,7 @@ static void btrfs_raid56_end_io(struct bio *bio)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
btrfs_check_read_bio(bbio, NULL);
else
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -424,7 +412,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
- btrfs_orig_bbio_end_io(bbio);
+ btrfs_bio_end_io(bbio, bbio->bio.bi_status);
btrfs_put_bioc(bioc);
}
@@ -502,8 +490,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
-static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
- struct btrfs_io_stripe *smap, int mirror_num)
+static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
+ struct btrfs_io_stripe *smap, int mirror_num)
{
if (!bioc) {
/* Single mirror read/write fast path. */
@@ -593,7 +581,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
- btrfs_orig_bbio_end_io(async->bbio);
+ btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
return;
}
@@ -603,7 +591,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
* context. This changes nothing when cgroups aren't in use.
*/
bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
- __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
+ btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}
static bool should_async_write(struct btrfs_bio *bbio)
@@ -664,11 +652,23 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
return true;
}
+static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
+{
+ unsigned int nr_segs;
+ int sector_offset;
+
+ map_length = min(map_length, bbio->fs_info->max_zone_append_size);
+ sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+ &nr_segs, map_length);
+ if (sector_offset)
+ return sector_offset << SECTOR_SHIFT;
+ return map_length;
+}
+
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = bbio->fs_info;
- struct btrfs_bio *orig_bbio = bbio;
struct bio *bio = &bbio->bio;
u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 length = bio->bi_iter.bi_size;
@@ -679,7 +679,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
blk_status_t ret;
int error;
- smap.is_scrub = !bbio->inode;
+ if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
+ smap.rst_search_commit_root = true;
+ else
+ smap.rst_search_commit_root = false;
btrfs_bio_counter_inc_blocked(fs_info);
error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
@@ -691,10 +694,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
map_length = min(map_length, length);
if (use_append)
- map_length = min(map_length, fs_info->max_zone_append_size);
+ map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
- bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+ bbio = btrfs_split_bio(fs_info, bbio, map_length);
bio = &bbio->bio;
}
@@ -706,7 +709,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
if (ret)
- goto fail_put_bio;
+ goto fail;
}
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
@@ -740,31 +743,40 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
ret = btrfs_bio_csum(bbio);
if (ret)
- goto fail_put_bio;
+ goto fail;
} else if (use_append ||
(btrfs_is_zoned(fs_info) && inode &&
inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
if (ret)
- goto fail_put_bio;
+ goto fail;
}
}
- __btrfs_submit_bio(bio, bioc, &smap, mirror_num);
+ btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
return map_length == length;
-fail_put_bio:
- if (map_length < length)
- btrfs_cleanup_bio(bbio);
fail:
btrfs_bio_counter_dec(fs_info);
- btrfs_bio_end_io(orig_bbio, ret);
+ /*
+ * We have split the original bbio, now we have to end both the current
+ * @bbio and remaining one, as the remaining one will never be submitted.
+ */
+ if (map_length < length) {
+ struct btrfs_bio *remaining = bbio->private;
+
+ ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
+ ASSERT(remaining);
+
+ btrfs_bio_end_io(remaining, ret);
+ }
+ btrfs_bio_end_io(bbio, ret);
/* Do not submit another chunk */
return true;
}
-void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
+void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
{
/* If bbio->inode is not populated, its file_offset must be 0. */
ASSERT(bbio->inode || bbio->file_offset == 0);
@@ -776,7 +788,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
/*
* Submit a repair write.
*
- * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
+ * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
* RAID setup. Here we only want to write the one bad copy, so we do the
* mapping ourselves and submit the bio directly.
*
@@ -865,7 +877,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_
ASSERT(smap.dev == fs_info->dev_replace.srcdev);
smap.dev = fs_info->dev_replace.tgtdev;
}
- __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
+ btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
return;
fail:
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index d9dd5276093d..e48612340745 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -29,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
/*
* Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and
- * passed to btrfs_submit_bio for mapping to the physical devices.
+ * passed to btrfs_submit_bbio() for mapping to the physical devices.
*/
struct btrfs_bio {
/*
@@ -42,7 +42,7 @@ struct btrfs_bio {
union {
/*
* For data reads: checksumming and original I/O information.
- * (for internal use in the btrfs_submit_bio machinery only)
+ * (for internal use in the btrfs_submit_bbio() machinery only)
*/
struct {
u8 *csum;
@@ -104,7 +104,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
/* Submit using blkcg_punt_bio_submit. */
#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
-void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
+void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct folio *folio,
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 2e49d978f504..7980b2e33a92 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -23,7 +23,7 @@
#include "extent-tree.h"
#ifdef CONFIG_BTRFS_DEBUG
-int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
+int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -40,9 +40,9 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
*
* Should be called with balance_lock held
*/
-static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
{
- struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
u64 target = 0;
if (!bctl)
@@ -1415,9 +1415,9 @@ out:
}
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *bg)
+ const struct btrfs_block_group *bg)
{
- struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *prev_trans = NULL;
const u64 start = bg->start;
const u64 end = start + bg->length - 1;
@@ -1756,14 +1756,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
-static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
{
if (btrfs_is_zoned(fs_info))
return btrfs_zoned_should_reclaim(fs_info);
return true;
}
-static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
+static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
{
const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
@@ -2006,8 +2006,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
-static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
- struct btrfs_path *path)
+static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
+ const struct btrfs_path *path)
{
struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
@@ -2055,7 +2055,7 @@ out_free_map:
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
@@ -2640,8 +2640,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
}
static int insert_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 chunk_offset,
- u64 start, u64 num_bytes)
+ const struct btrfs_device *device, u64 chunk_offset,
+ u64 start, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
@@ -2817,7 +2817,7 @@ next:
* For extent tree v2 we use the block_group_item->chunk_offset to point at our
* global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
*/
-static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
{
u64 div = SZ_1G;
u64 index;
@@ -3842,8 +3842,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
}
-static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *sinfo, int force)
+static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
@@ -4218,7 +4218,7 @@ out:
return ret;
}
-static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
+static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
{
u64 num_dev;
@@ -4622,7 +4622,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
return 0;
}
-bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
+bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
{
if (btrfs_is_zoned(bg->fs_info))
return false;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 915111338fc0..36937eeab9b8 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -266,7 +266,7 @@ struct btrfs_block_group {
u64 reclaim_mark;
};
-static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
{
return (block_group->start + block_group->length);
}
@@ -278,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
}
-static inline bool btrfs_is_block_group_data_only(
- struct btrfs_block_group *block_group)
+static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
{
/*
* In mixed mode the fragmentation is expected to be high, lowering the
@@ -290,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only(
}
#ifdef CONFIG_BTRFS_DEBUG
-int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group);
+int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group);
#endif
struct btrfs_block_group *btrfs_lookup_first_block_group(
@@ -370,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}
-static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
+static inline int btrfs_block_group_done(const struct btrfs_block_group *cache)
{
smp_mb();
return cache->cached == BTRFS_CACHE_FINISHED ||
@@ -387,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size);
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
enum btrfs_block_group_size_class size_class,
bool force_wrong_size_class);
-bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg);
+bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index b299b82d676e..a07b9594dc70 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -553,7 +553,7 @@ try_reserve:
return ERR_PTR(ret);
}
-int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
u64 needed_bytes;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 1f53b967d069..d12b1fac5c74 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -89,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
-int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
+int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3056c8aed8ef..9a4b7c119318 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -350,10 +350,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
WRITE_ONCE(inode->first_dir_index_to_log, index);
}
-static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
-{
- return container_of(inode, struct btrfs_inode, vfs_inode);
-}
+/* Type checked and const-preserving VFS inode -> btrfs inode. */
+#define BTRFS_I(_inode) \
+ _Generic(_inode, \
+ struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \
+ const struct inode *: (const struct btrfs_inode *)container_of( \
+ _inode, const struct btrfs_inode, vfs_inode))
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
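
For illustration, a minimal userspace sketch of the same _Generic trick (all names below are made up, not btrfs code): the selection dispatches on the pointer's constness, so a const VFS-style inode yields a const containing structure, which a plain container_of()-style macro alone would silently cast away.

    #include <stddef.h>
    #include <stdio.h>

    struct vfs_inode { unsigned long ino; };
    struct my_inode { int flags; struct vfs_inode vfs; };

    #define my_container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    /* Const-preserving wrapper: picks the right cast based on the argument type. */
    #define MY_I(_inode) \
            _Generic(_inode, \
                    struct vfs_inode *: my_container_of(_inode, struct my_inode, vfs), \
                    const struct vfs_inode *: (const struct my_inode *) \
                            my_container_of(_inode, struct my_inode, vfs))

    int main(void)
    {
            struct my_inode mi = { .flags = 7, .vfs = { .ino = 42 } };
            const struct vfs_inode *cv = &mi.vfs;

            /* MY_I(cv) has type const struct my_inode *, so writes through it won't compile. */
            printf("flags=%d ino=%lu\n", MY_I(&mi.vfs)->flags, MY_I(cv)->vfs.ino);
            return 0;
    }
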
@@ -505,6 +507,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
return true;
}
+static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode)
+{
+ /* Immediately trigger a crash if the inode is not locked. */
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+ /* Trigger a splat in dmesg if this task is not holding the lock. */
+ lockdep_assert_held(&inode->vfs_inode.i_rwsem);
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
@@ -578,7 +588,7 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
struct btrfs_path *path);
struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, u64 start, u64 len);
+ struct folio *folio, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
@@ -596,9 +606,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
-int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc);
-int btrfs_writepage_cow_fixup(struct page *page);
+int btrfs_writepage_cow_fixup(struct folio *folio);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a8e2c461aff7..90aef2627ca2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -138,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
+ const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
- case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
+ case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
- case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio,
dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
@@ -395,7 +395,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb->bbio.ordered = ordered;
btrfs_add_compressed_bio_folios(cb);
- btrfs_submit_bio(&cb->bbio, 0);
+ btrfs_submit_bbio(&cb->bbio, 0);
}
/*
@@ -420,7 +420,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
int ret;
- struct page *page;
+ struct folio *folio;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
@@ -453,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- page = xa_load(&mapping->i_pages, pg_index);
- if (page && !xa_is_value(page)) {
- sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ folio = __filemap_get_folio(mapping, pg_index, 0, 0);
+ if (!IS_ERR(folio)) {
+ u64 folio_sz = folio_size(folio);
+ u64 offset = offset_in_folio(folio, cur);
+
+ folio_put(folio);
+ sectors_missed += (folio_sz - offset) >>
fs_info->sectorsize_bits;
/* Beyond threshold, no need to continue */
@@ -466,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* Jump to next page start as we already have page for
* current offset.
*/
- cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ cur += (folio_sz - offset);
continue;
}
- page = __page_cache_alloc(mapping_gfp_constraint(mapping,
- ~__GFP_FS));
- if (!page)
+ folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
+ ~__GFP_FS), 0);
+ if (!folio)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
- put_page(page);
+ if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) {
/* There is already a page, skip to page end */
- cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ cur += folio_size(folio);
+ folio_put(folio);
continue;
}
- if (!*memstall && PageWorkingset(page)) {
+ if (!*memstall && folio_test_workingset(folio)) {
psi_memstall_enter(pflags);
*memstall = 1;
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
- page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
@@ -511,28 +515,28 @@ static noinline int add_ra_bio_pages(struct inode *inode,
orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
add_size = min(em->start + em->len, page_end + 1) - cur;
free_extent_map(em);
+ unlock_extent(tree, cur, page_end, NULL);
- if (page->index == end_index) {
- size_t zero_offset = offset_in_page(isize);
+ if (folio->index == end_index) {
+ size_t zero_offset = offset_in_folio(folio, isize);
if (zero_offset) {
int zeros;
- zeros = PAGE_SIZE - zero_offset;
- memzero_page(page, zero_offset, zeros);
+ zeros = folio_size(folio) - zero_offset;
+ folio_zero_range(folio, zero_offset, zeros);
}
}
- ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
- if (ret != add_size) {
- unlock_extent(tree, cur, page_end, NULL);
- unlock_page(page);
- put_page(page);
+ if (!bio_add_folio(orig_bio, folio, add_size,
+ offset_in_folio(folio, cur))) {
+ folio_unlock(folio);
+ folio_put(folio);
break;
}
/*
@@ -541,9 +545,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, page_folio(page),
- cur, add_size);
- put_page(page);
+ btrfs_subpage_start_reader(fs_info, folio, cur,
+ add_size);
+ folio_put(folio);
cur += add_size;
}
return 0;
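
One point worth noting about the conversion above: unlike xa_load(), which returns NULL or a shadow value, __filemap_get_folio() returns ERR_PTR(-ENOENT) when nothing is cached and takes a reference on success that must be dropped. A kernel-context sketch of that probe pattern (illustrative helper, not part of this patch):

    /* Sketch: probing the page cache with folios instead of raw xarray loads. */
    static bool example_folio_already_cached(struct address_space *mapping,
                                             pgoff_t index)
    {
            /* Takes a reference on success, returns ERR_PTR(-ENOENT) when absent. */
            struct folio *folio = __filemap_get_folio(mapping, index, 0, 0);

            if (IS_ERR(folio))
                    return false;
            folio_put(folio);
            return true;
    }
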
@@ -626,7 +630,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
if (memstall)
psi_memstall_leave(&pflags);
- btrfs_submit_bio(&cb->bbio, 0);
+ btrfs_submit_bbio(&cb->bbio, 0);
return;
out_free_compressed_pages:
@@ -1057,10 +1061,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
-int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
+int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
- struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
struct list_head *workspace;
const u32 sectorsize = fs_info->sectorsize;
int ret;
@@ -1073,7 +1077,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
workspace = get_workspace(type, 0);
- ret = compression_decompress(type, workspace, data_in, dest_page,
+ ret = compression_decompress(type, workspace, data_in, dest_folio,
dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index cfdc64319186..b6563b6a333e 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -82,13 +82,21 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
return ((type_level & 0xF0) >> 4);
}
+/* @range_end must be exclusive. */
+static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
+{
+ u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;
+
+ return min(range_end, page_end) - cur;
+}
+
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
-int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
+int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
@@ -154,7 +162,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
@@ -165,7 +173,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
@@ -175,7 +183,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
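
As a quick check of the arithmetic in btrfs_calc_input_length(): the helper returns the number of bytes from cur to the end of the current page, clamped by the exclusive range end. A standalone userspace sketch (round_down() and min() reimplemented here, so this is not the kernel build):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096ULL
    #define round_down(x, y) ((x) & ~((y) - 1))
    #define min(a, b) ((a) < (b) ? (a) : (b))

    static unsigned int calc_input_length(uint64_t range_end, uint64_t cur)
    {
            uint64_t page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;

            /* Never cross a page boundary, never go past the (exclusive) range end. */
            return min(range_end, page_end) - cur;
    }

    int main(void)
    {
            printf("%u\n", calc_input_length(9000, 5000)); /* 3192: clamped by page end */
            printf("%u\n", calc_input_length(8000, 5000)); /* 3000: clamped by range end */
            return 0;
    }
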
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 451203055bbf..0cc919d15b14 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2564,8 +2564,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
*
*/
static void fixup_low_keys(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_disk_key *key, int level)
+ const struct btrfs_path *path,
+ const struct btrfs_disk_key *key, int level)
{
int i;
struct extent_buffer *t;
@@ -2594,7 +2594,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
* that the new key won't break the order
*/
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const struct btrfs_key *new_key)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2660,8 +2660,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
* is correct, we only need to bother the last key of @left and the first
* key of @right.
*/
-static bool check_sibling_keys(struct extent_buffer *left,
- struct extent_buffer *right)
+static bool check_sibling_keys(const struct extent_buffer *left,
+ const struct extent_buffer *right)
{
struct btrfs_key left_last;
struct btrfs_key right_first;
@@ -2928,8 +2928,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
* blocknr is the block the key points to.
*/
static int insert_ptr(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct btrfs_disk_key *key, u64 bytenr,
+ const struct btrfs_path *path,
+ const struct btrfs_disk_key *key, u64 bytenr,
int slot, int level)
{
struct extent_buffer *lower;
@@ -4019,7 +4019,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
* the front.
*/
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 new_size, int from_end)
+ const struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
@@ -4111,7 +4111,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
* make the item pointed to by the path bigger, data_size is the added size.
*/
void btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 data_size)
+ const struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 75fa563e4cac..1a44fb9845e3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,6 +6,7 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
+#include "linux/cleanup.h"
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
@@ -84,6 +85,9 @@ struct btrfs_path {
unsigned int nowait:1;
};
+#define BTRFS_PATH_AUTO_FREE(path_name) \
+ struct btrfs_path *path_name __free(btrfs_free_path) = NULL
+
/*
* The state of btrfs root
*/
@@ -459,7 +463,6 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
- bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
@@ -539,7 +542,7 @@ int btrfs_previous_item(struct btrfs_root *root,
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
@@ -573,9 +576,9 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 data_size);
+ const struct btrfs_path *path, u32 data_size);
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u32 new_size, int from_end);
+ const struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -599,6 +602,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
+DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
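
The DEFINE_FREE()/__free() pair comes from <linux/cleanup.h>, so a caller declaring its path with BTRFS_PATH_AUTO_FREE() gets btrfs_free_path() run automatically when the variable goes out of scope. A hypothetical caller (not part of this patch) could then drop the explicit free on every exit path:

    /* Sketch only: an imaginary lookup helper using the auto-freed path. */
    static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
    {
            BTRFS_PATH_AUTO_FREE(path);     /* freed automatically on every return */
            int ret;

            path = btrfs_alloc_path();
            if (!path)
                    return -ENOMEM;

            ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
            if (ret < 0)
                    return ret;             /* no btrfs_free_path() needed here */

            /* ... use path->nodes[0] / path->slots[0] ... */
            return 0;
    }
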
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index f6dbda37a361..acf1f39e45d0 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -45,8 +45,8 @@ struct inode_defrag {
u32 extent_thresh;
};
-static int __compare_inode_defrag(struct inode_defrag *defrag1,
- struct inode_defrag *defrag2)
+static int compare_inode_defrag(const struct inode_defrag *defrag1,
+ const struct inode_defrag *defrag2)
{
if (defrag1->root > defrag2->root)
return 1;
@@ -61,16 +61,14 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
}
/*
- * Pop a record for an inode into the defrag tree. The lock must be held
+ * Insert a record for an inode into the defrag tree. The lock must be held
* already.
*
* If you're inserting a record for an older transid than an existing record,
* the transid already in the tree is lowered.
- *
- * If an existing record is found the defrag item you pass in is freed.
*/
-static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
+static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
+ struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct inode_defrag *entry;
@@ -83,7 +81,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- ret = __compare_inode_defrag(defrag, entry);
+ ret = compare_inode_defrag(defrag, entry);
if (ret < 0)
p = &parent->rb_left;
else if (ret > 0)
@@ -107,7 +105,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
return 0;
}
-static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
return 0;
@@ -119,34 +117,28 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
}
/*
- * Insert a defrag record for this inode if auto defrag is enabled.
+ * Insert a defrag record for this inode if auto defrag is enabled. No errors
+ * returned as they're not considered fatal.
*/
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh)
+void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode_defrag *defrag;
- u64 transid;
int ret;
- if (!__need_auto_defrag(fs_info))
- return 0;
+ if (!need_auto_defrag(fs_info))
+ return;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
- return 0;
-
- if (trans)
- transid = trans->transid;
- else
- transid = btrfs_get_root_last_trans(root);
+ return;
defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
- return -ENOMEM;
+ return;
defrag->ino = btrfs_ino(inode);
- defrag->transid = transid;
+ defrag->transid = btrfs_get_root_last_trans(root);
defrag->root = btrfs_root_id(root);
defrag->extent_thresh = extent_thresh;
@@ -157,14 +149,13 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
* and then re-read this inode, the new in-memory inode won't have the
* IN_DEFRAG flag set. In that case we may find an existing defrag record.
*/
- ret = __btrfs_add_inode_defrag(inode, defrag);
+ ret = btrfs_insert_inode_defrag(inode, defrag);
if (ret)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
} else {
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
spin_unlock(&fs_info->defrag_inodes_lock);
- return 0;
}
/*
@@ -189,7 +180,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
- ret = __compare_inode_defrag(&tmp, entry);
+ ret = compare_inode_defrag(&tmp, entry);
if (ret < 0)
p = parent->rb_left;
else if (ret > 0)
@@ -198,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
goto out;
}
- if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ if (parent && compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -214,27 +205,22 @@ out:
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
- struct inode_defrag *defrag;
- struct rb_node *node;
+ struct inode_defrag *defrag, *next;
spin_lock(&fs_info->defrag_inodes_lock);
- node = rb_first(&fs_info->defrag_inodes);
- while (node) {
- rb_erase(node, &fs_info->defrag_inodes);
- defrag = rb_entry(node, struct inode_defrag, rb_node);
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- cond_resched_lock(&fs_info->defrag_inodes_lock);
+ rbtree_postorder_for_each_entry_safe(defrag, next,
+ &fs_info->defrag_inodes, rb_node)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- node = rb_first(&fs_info->defrag_inodes);
- }
spin_unlock(&fs_info->defrag_inodes_lock);
}
#define BTRFS_DEFRAG_BATCH 1024
-static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
- struct inode_defrag *defrag)
+static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag,
+ struct file_ra_state *ra)
{
struct btrfs_root *inode_root;
struct inode *inode;
@@ -245,7 +231,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
again:
if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
goto cleanup;
- if (!__need_auto_defrag(fs_info))
+ if (!need_auto_defrag(fs_info))
goto cleanup;
/* Get the inode */
@@ -273,9 +259,10 @@ again:
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
+ file_ra_state_init(ra, inode->i_mapping);
sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
iput(inode);
@@ -302,11 +289,13 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
atomic_inc(&fs_info->defrag_running);
while (1) {
+ struct file_ra_state ra = { 0 };
+
/* Pause the auto defragger. */
if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
break;
- if (!__need_auto_defrag(fs_info))
+ if (!need_auto_defrag(fs_info))
break;
/* find an inode to defrag */
@@ -324,7 +313,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
- __btrfs_run_defrag_inode(fs_info, defrag);
+ btrfs_run_defrag_inode(fs_info, defrag, &ra);
}
atomic_dec(&fs_info->defrag_running);
@@ -1317,8 +1306,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
if (entry->start + range_len <= *last_scanned_ret)
continue;
- if (ra)
- page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
ra, NULL, entry->start >> PAGE_SHIFT,
((entry->start + range_len - 1) >> PAGE_SHIFT) -
(entry->start >> PAGE_SHIFT) + 1);
@@ -1350,7 +1338,7 @@ out:
* Entry point to file defragmentation.
*
* @inode: inode to be defragged
- * @ra: readahead state (can be NUL)
+ * @ra: readahead state
* @range: defrag options including range and flags
* @newer_than: minimum transid to defrag
* @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
@@ -1372,12 +1360,13 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
- bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;
+ ASSERT(ra);
+
if (isize == 0)
return 0;
@@ -1407,18 +1396,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
/*
- * If we were not given a ra, allocate a readahead context. As
- * readahead is just an optimization, defrag will work without it so
- * we don't error out.
- */
- if (!ra) {
- ra_allocated = true;
- ra = kzalloc(sizeof(*ra), GFP_KERNEL);
- if (ra)
- file_ra_state_init(ra, inode->i_mapping);
- }
-
- /*
* Make writeback start from the beginning of the range, so that the
* defrag range can be written sequentially.
*/
@@ -1472,8 +1449,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
cond_resched();
}
- if (ra_allocated)
- kfree(ra);
/*
* Update range.start for autodefrag, this will indicate where to start
* in next run.
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 878528e086fb..6b7596c4f0dc 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -18,8 +18,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
-int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, u32 extent_thresh);
+void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_defrag_root(struct btrfs_root *root);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 2ac9296edccb..ad9ef8312e41 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -855,11 +855,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
/* Record qgroup extent info if provided */
if (qrecord) {
- if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
- delayed_refs, qrecord))
+ int ret;
+
+ ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info,
+ delayed_refs, qrecord);
+ if (ret) {
+ /* Clean up if insertion fails or item exists. */
+ xa_release(&delayed_refs->dirty_extents, qrecord->bytenr);
kfree(qrecord);
- else
+ } else {
qrecord_inserted = true;
+ }
}
trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
@@ -1005,18 +1011,16 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
- return -ENOMEM;
- }
+ if (!head_ref)
+ goto free_node;
if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
- if (!record) {
- kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
- kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
- return -ENOMEM;
- }
+ if (!record)
+ goto free_head_ref;
+ if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
+ generic_ref->bytenr, GFP_NOFS))
+ goto free_record;
}
init_delayed_ref_common(fs_info, node, generic_ref);
@@ -1052,6 +1056,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
if (qrecord_inserted)
return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
+
+free_record:
+ kfree(record);
+free_head_ref:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_node:
+ kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
+ return -ENOMEM;
}
/*
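
The xa_reserve()/xa_release() pairing above follows the usual reserve-then-fill xarray pattern: preallocate the slot while sleeping allocations are still allowed, fill it later under the spinlock, and give the slot back if the insert loses a race or fails. A kernel-context sketch of the general pattern (generic example, not the qgroup tracing code itself):

    /* Sketch of the reserve-then-fill xarray pattern used for dirty_extents. */
    static int example_record_dirty_extent(struct xarray *xa, unsigned long index,
                                           void *record, spinlock_t *lock)
    {
            /* Preallocate the slot while we may still sleep. */
            if (xa_reserve(xa, index, GFP_NOFS))
                    return -ENOMEM;

            spin_lock(lock);
            if (xa_load(xa, index)) {
                    /* Someone beat us to it: give the reserved slot back. */
                    xa_release(xa, index);
            } else {
                    /* Fill the reserved slot; no allocation is needed now. */
                    xa_store(xa, index, record, GFP_ATOMIC);
            }
            spin_unlock(lock);
            return 0;
    }
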
@@ -1134,6 +1146,73 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
return find_ref_head(delayed_refs, bytenr, false);
}
+static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
+{
+ int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY;
+
+ if (type < entry->type)
+ return -1;
+ if (type > entry->type)
+ return 1;
+
+ if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+ if (root < entry->ref_root)
+ return -1;
+ if (root > entry->ref_root)
+ return 1;
+ } else {
+ if (parent < entry->parent)
+ return -1;
+ if (parent > entry->parent)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Check to see if a given root/parent reference is attached to the head. This
+ * only checks for BTRFS_ADD_DELAYED_REF references that match, as that
+ * indicates the reference exists for the given root or parent. This is for
+ * tree blocks only.
+ *
+ * @head: the head of the bytenr we're searching.
+ * @root: the root objectid of the reference if it is a normal reference.
+ * @parent: the parent if this is a shared backref.
+ */
+bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
+ u64 root, u64 parent)
+{
+ struct rb_node *node;
+ bool found = false;
+
+ lockdep_assert_held(&head->mutex);
+
+ spin_lock(&head->lock);
+ node = head->ref_tree.rb_root.rb_node;
+ while (node) {
+ struct btrfs_delayed_ref_node *entry;
+ int ret;
+
+ entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+ ret = find_comp(entry, root, parent);
+ if (ret < 0) {
+ node = node->rb_left;
+ } else if (ret > 0) {
+ node = node->rb_right;
+ } else {
+ /*
+ * We only want to count ADD actions, as drops mean the
+ * ref doesn't exist.
+ */
+ if (entry->action == BTRFS_ADD_DELAYED_REF)
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&head->lock);
+ return found;
+}
+
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index ef15e998be03..085f30968aba 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -202,8 +202,8 @@ struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root_cached href_root;
- /* dirty extent records */
- struct rb_root dirty_extent_root;
+ /* Track dirty extent records. */
+ struct xarray dirty_extents;
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -389,6 +389,8 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
+bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
+ u64 root, u64 parent);
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
{
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f638c458d285..83d5cdd77f29 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -824,22 +824,45 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
- u64 start = 0;
- int i;
+ struct rb_node *node;
+
+ /*
+ * The chunk mutex must be held so that no new chunks can be created
+ * while we are updating existing chunks. This guarantees we don't miss
+ * any new chunk that gets created for a range that falls before the
+ * range of the last chunk we processed.
+ */
+ lockdep_assert_held(&fs_info->chunk_mutex);
write_lock(&fs_info->mapping_tree_lock);
- do {
+ node = rb_first_cached(&fs_info->mapping_tree);
+ while (node) {
+ struct rb_node *next = rb_next(node);
struct btrfs_chunk_map *map;
+ u64 next_start;
- map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
- if (!map)
- break;
- for (i = 0; i < map->num_stripes; i++)
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ next_start = map->start + map->chunk_len;
+
+ for (int i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
- start = map->start + map->chunk_len;
- btrfs_free_chunk_map(map);
- } while (start);
+
+ if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) {
+ map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX);
+ if (!map)
+ break;
+ node = &map->rb_node;
+ /*
+ * Drop the lookup reference since we are holding the
+ * lock in write mode and no one can remove the chunk
+ * map from the tree and drop its tree reference.
+ */
+ btrfs_free_chunk_map(map);
+ } else {
+ node = next;
+ }
+ }
write_unlock(&fs_info->mapping_tree_lock);
}
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 67adbe9d294a..bd38df5647e3 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct btrfs_ordered_extent *ordered;
int ret = 0;
+ /* Direct lock must be taken before the extent lock. */
+ if (nowait) {
+ if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
+ return -EAGAIN;
+ } else {
+ lock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ }
+
while (1) {
if (nowait) {
if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state))
- return -EAGAIN;
+ cached_state)) {
+ ret = -EAGAIN;
+ break;
+ }
} else {
lock_extent(io_tree, lockstart, lockend, cached_state);
}
@@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
cond_resched();
}
+ if (ret)
+ unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
return ret;
}
@@ -353,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
int ret = 0;
u64 len = length;
const u64 data_alloc_len = length;
- bool unlock_extents = false;
+ u32 unlock_bits = EXTENT_LOCKED;
/*
* We could potentially fault if we have a buffer > PAGE_SIZE, and if
@@ -514,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
start, &len, flags);
if (ret < 0)
goto unlock_err;
- unlock_extents = true;
/* Recalc len in case the new em is smaller than requested */
len = min(len, em->len - (start - em->start));
if (dio_data->data_space_reserved) {
@@ -535,22 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
release_offset,
release_len);
}
- } else {
- /*
- * We need to unlock only the end area that we aren't using.
- * The rest is going to be unlocked by the endio routine.
- */
- lockstart = start + len;
- if (lockstart < lockend)
- unlock_extents = true;
}
- if (unlock_extents)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- else
- free_extent_state(cached_state);
-
/*
* Translate extent map information to iomap.
* We trim the extents (and move the addr) even though iomap code does
@@ -569,11 +566,33 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->length = len;
free_extent_map(em);
+ /*
+ * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
+ * writes only hold it for this part. We hold the extent lock until
+ * we're completely done with the extent map to make sure it remains
+ * valid.
+ */
+ if (write)
+ unlock_bits |= EXTENT_DIO_LOCKED;
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, &cached_state);
+
+ /* We didn't use everything, unlock the dio extent for the remainder. */
+ if (!write && (start + len) < lockend)
+ unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
+ lockend, NULL);
+
return 0;
unlock_err:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ /*
+ * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
+ * to update this, be explicit that we expect EXTENT_LOCKED and
+ * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -596,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
- NULL);
+ unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
return 0;
}
@@ -608,8 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
btrfs_finish_ordered_extent(dio_data->ordered, NULL,
pos, length, false);
else
- unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
+ unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
ret = -ENOTBLK;
}
if (write) {
@@ -641,8 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
dip->file_offset, dip->bytes,
!bio->bi_status);
} else {
- unlock_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
+ unlock_dio_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
}
bbio->bio.bi_private = bbio->private;
@@ -726,7 +745,7 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
}
}
- btrfs_submit_bio(bbio, 0);
+ btrfs_submit_bbio(bbio, 0);
}
static const struct iomap_ops btrfs_dio_iomap_ops = {
@@ -864,13 +883,6 @@ again:
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
} else {
- struct btrfs_file_private stack_private = { 0 };
- struct btrfs_file_private *private;
- const bool have_private = (file->private_data != NULL);
-
- if (!have_private)
- file->private_data = &stack_private;
-
/*
* If we have a synchronous write, we must make sure the fsync
* triggered by the iomap_dio_complete() call below doesn't
@@ -879,13 +891,10 @@ again:
* partial writes due to the input buffer (or parts of it) not
* being already faulted in.
*/
- private = file->private_data;
- private->fsync_skip_inode_lock = true;
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
- private->fsync_skip_inode_lock = false;
-
- if (!have_private)
- file->private_data = NULL;
+ current->journal_info = NULL;
}
/* No increment (+=) because iomap returns a cumulative value. */
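
On the fsync side (not shown in this hunk), the stub in current->journal_info lets btrfs_sync_file() detect that it is being called from the iomap_dio_complete() above, where the inode lock is already held, and skip taking it again. A sketch of that check, with an invented helper name and assuming BTRFS_TRANS_DIO_WRITE_STUB is a sentinel pointer value:

    /* Sketch: is this task completing a synchronous O_DIRECT write? */
    static bool example_skip_ilock_for_dio_fsync(void)
    {
            /*
             * The DIO write path already holds the inode lock, so the fsync
             * triggered from iomap_dio_complete() must not take it again.
             */
            return current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB;
    }
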
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 944a7340f6a4..e815d165cccc 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
};
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
- struct btrfs_block_group *block_group)
+ const struct btrfs_block_group *block_group)
{
return &discard_ctl->discard_list[block_group->discard_index];
}
@@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
*
* Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
-static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
+static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
{
struct btrfs_fs_info *fs_info = container_of(discard_ctl,
struct btrfs_fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6f5441e62d1..25d768e67e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -525,7 +525,7 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return false;
- return try_release_extent_buffer(&folio->page);
+ return try_release_extent_buffer(folio);
}
static void btree_invalidate_folio(struct folio *folio, size_t offset,
@@ -1285,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
- kfree(fs_info->subpage_info);
kvfree(fs_info);
}
@@ -3322,6 +3321,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
@@ -3346,20 +3346,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
- if (sectorsize < PAGE_SIZE) {
- struct btrfs_subpage_info *subpage_info;
-
+ if (sectorsize < PAGE_SIZE)
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
- if (!subpage_info) {
- ret = -ENOMEM;
- goto fail_alloc;
- }
- btrfs_init_subpage_info(subpage_info, sectorsize);
- fs_info->subpage_info = subpage_info;
- }
ret = btrfs_init_workqueues(fs_info);
if (ret)
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index c54c5d7a5cd5..6d08c100b01d 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -126,7 +126,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
* Empty an io tree, removing and freeing every extent state record from the
* tree. This should be called once we are sure no other task can access the
* tree anymore, so no tree updates happen after we empty the tree and there
- * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
+ * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
* set on any extent state when calling this function).
*/
void extent_io_tree_release(struct extent_io_tree *tree)
@@ -141,7 +141,7 @@ void extent_io_tree_release(struct extent_io_tree *tree)
rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
/* Clear node to keep free_extent_state() happy. */
RB_CLEAR_NODE(&state->rb_node);
- ASSERT(!(state->state & EXTENT_LOCKED));
+ ASSERT(!(state->state & EXTENT_LOCK_BITS));
/*
* No need for a memory barrier here, as we are holding the tree
* lock and we only change the waitqueue while holding that lock
@@ -399,7 +399,7 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
*/
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
- if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
return;
merge_prev_state(tree, state);
@@ -445,7 +445,7 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
struct rb_node *parent = NULL;
const u64 start = state->start - 1;
const u64 end = state->end + 1;
- const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));
+ const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
set_state_bits(tree, state, bits, changeset);
@@ -616,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
* inserting elements in the tree, so the gfp mask is used to indicate which
* allocations or sleeping are allowed.
*
- * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
- * range from the tree regardless of state (ie for truncate).
- *
* The range [start, end] is inclusive.
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
@@ -647,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
- wake = (bits & EXTENT_LOCKED) ? 1 : 0;
- if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+ wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0);
+ if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
clear = 1;
again:
if (!prealloc) {
@@ -861,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state,
static void cache_state(struct extent_state *state,
struct extent_state **cached_ptr)
{
- return cache_state_if_flags(state, cached_ptr,
- EXTENT_LOCKED | EXTENT_BOUNDARY);
+ return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY);
}
/*
@@ -1063,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int ret = 0;
u64 last_start;
u64 last_end;
- u32 exclusive_bits = (bits & EXTENT_LOCKED);
+ u32 exclusive_bits = (bits & EXTENT_LOCK_BITS);
gfp_t mask;
set_gfp_mask_from_bits(&bits, &mask);
@@ -1812,12 +1808,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
- * We don't support EXTENT_LOCKED yet, as current changeset will
- * record any bits changed, so for EXTENT_LOCKED case, it will
- * either fail with -EEXIST or changeset will record the whole
- * range.
+ * We don't support EXTENT_LOCK_BITS yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCK_BITS case, it will either
+ * fail with -EEXIST or changeset will record the whole range.
*/
- ASSERT(!(bits & EXTENT_LOCKED));
+ ASSERT(!(bits & EXTENT_LOCK_BITS));
return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}
@@ -1826,26 +1821,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
- * Don't support EXTENT_LOCKED case, same reason as
+ * Don't support EXTENT_LOCK_BITS case, same reason as
* set_record_extent_bits().
*/
- ASSERT(!(bits & EXTENT_LOCKED));
+ ASSERT(!(bits & EXTENT_LOCK_BITS));
return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
+bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached)
{
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ err = __set_extent_bit(tree, start, end, bits, &failed_start,
NULL, cached, NULL);
if (err == -EEXIST) {
if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, cached);
+ clear_extent_bit(tree, start, failed_start - 1, bits, cached);
return 0;
}
return 1;
@@ -1855,23 +1849,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
* Either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state)
+int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+ err = __set_extent_bit(tree, start, end, bits, &failed_start,
&failed_state, cached_state, NULL);
while (err == -EEXIST) {
if (failed_start != start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, cached_state);
+ bits, cached_state);
- wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
- &failed_state);
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ wait_extent_bit(tree, failed_start, end, bits, &failed_state);
+ err = __set_extent_bit(tree, start, end, bits,
&failed_start, &failed_state,
cached_state, NULL);
}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 9d3a52d8f59a..6ffef1cd37c1 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -19,6 +19,7 @@ enum {
ENUM_BIT(EXTENT_DIRTY),
ENUM_BIT(EXTENT_UPTODATE),
ENUM_BIT(EXTENT_LOCKED),
+ ENUM_BIT(EXTENT_DIO_LOCKED),
ENUM_BIT(EXTENT_NEW),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
@@ -67,6 +68,8 @@ enum {
EXTENT_ADD_INODE_BYTES | \
EXTENT_CLEAR_ALL_BITS)
+#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED)
+
/*
* Redefined bits above which are used only in the device allocation tree,
* shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
@@ -134,12 +137,22 @@ const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tre
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
+int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached);
+bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
+{
+ return __lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+}
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
+static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+}
int __init extent_state_init_cachep(void);
void __cold extent_state_free_cachep(void);
@@ -212,5 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
+static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+}
+
+static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+}
+
+static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL);
+}
#endif /* BTRFS_EXTENT_IO_TREE_H */
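
Taken together, the new helpers give direct I/O a two-level scheme: EXTENT_DIO_LOCKED is held for the whole life of a DIO read, while EXTENT_LOCKED is only held while the extent map is being looked up. A condensed kernel-context sketch of the read-side ordering, based on the direct-io.c hunks above (error handling omitted):

    /* Sketch of the DIO read locking order; not the literal iomap code. */
    static void example_dio_read_locking(struct btrfs_inode *inode, u64 start, u64 end)
    {
            struct extent_io_tree *io_tree = &inode->io_tree;

            /* 1. Outer DIO lock, held until the read bio completes. */
            lock_dio_extent(io_tree, start, end, NULL);

            /* 2. Inner extent lock, only while mapping the range. */
            lock_extent(io_tree, start, end, NULL);
            /* ... look up the extent map, build the iomap ... */
            unlock_extent(io_tree, start, end, NULL);

            /* 3. The bio end_io (or the hole/error paths) drops the DIO lock. */
            unlock_dio_extent(io_tree, start, end, NULL);
    }
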
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ff9f0d41987e..a5966324607d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5472,23 +5472,62 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 parent,
int level)
{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *head;
struct btrfs_path *path;
struct btrfs_extent_inline_ref *iref;
int ret;
+ bool exists = false;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-
+again:
ret = lookup_extent_backref(trans, path, &iref, bytenr,
root->fs_info->nodesize, parent,
btrfs_root_id(root), level, 0);
+ if (ret != -ENOENT) {
+ /*
+ * If we get 0 then we found our reference, so return 1. Otherwise
+ * return the error, which at this point cannot be -ENOENT.
+ */
+ btrfs_free_path(path);
+ return (ret < 0) ? ret : 1;
+ }
+
+ /*
+ * We could have a delayed ref with this reference, so look it up while
+ * we're holding the path open to make sure we don't race with the
+ * delayed ref running.
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ if (!head)
+ goto out;
+ if (!mutex_trylock(&head->mutex)) {
+ /*
+ * We're contended, means that the delayed ref is running, get a
+ * reference and wait for the ref head to be complete and then
+ * try again.
+ */
+ refcount_inc(&head->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref_head(head);
+ goto again;
+ }
+
+ exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
+ mutex_unlock(&head->mutex);
+out:
+ spin_unlock(&delayed_refs->lock);
btrfs_free_path(path);
- if (ret == -ENOENT)
- return 0;
- if (ret < 0)
- return ret;
- return 1;
+ return exists ? 1 : 0;
}
/*
@@ -6512,13 +6551,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
continue;
ret = btrfs_trim_free_extents(device, &group_trimmed);
+
+ trimmed += group_trimmed;
if (ret) {
dev_failed++;
dev_ret = ret;
break;
}
-
- trimmed += group_trimmed;
}
mutex_unlock(&fs_devices->device_list_mutex);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index aa7f8148cd0d..39c9677c47d5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -101,6 +101,13 @@ struct btrfs_bio_ctrl {
blk_opf_t opf;
btrfs_bio_end_io_t end_io_func;
struct writeback_control *wbc;
+
+ /*
+ * The sectors of the page which are going to be submitted by
+ * extent_writepage_io().
+ * This is to avoid touching ranges covered by compression/inline.
+ */
+ unsigned long submit_bitmap;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
@@ -117,7 +124,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
btrfs_submit_compressed_read(bbio);
else
- btrfs_submit_bio(bbio, 0);
+ btrfs_submit_bbio(bbio, 0);
/* The bbio is owned by the end_io handler now */
bio_ctrl->bbio = NULL;
@@ -164,11 +171,10 @@ void __cold extent_buffer_free_cachep(void)
kmem_cache_destroy(extent_buffer_cache);
}
-static void process_one_page(struct btrfs_fs_info *fs_info,
- struct page *page, const struct page *locked_page,
- unsigned long page_ops, u64 start, u64 end)
+static void process_one_folio(struct btrfs_fs_info *fs_info,
+ struct folio *folio, const struct folio *locked_folio,
+ unsigned long page_ops, u64 start, u64 end)
{
- struct folio *folio = page_folio(page);
u32 len;
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
@@ -183,13 +189,13 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
if (page_ops & PAGE_END_WRITEBACK)
btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
- if (page != locked_page && (page_ops & PAGE_UNLOCK))
+ if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
-static void __process_pages_contig(struct address_space *mapping,
- const struct page *locked_page, u64 start, u64 end,
- unsigned long page_ops)
+static void __process_folios_contig(struct address_space *mapping,
+ const struct folio *locked_folio, u64 start,
+ u64 end, unsigned long page_ops)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
pgoff_t start_index = start >> PAGE_SHIFT;
@@ -207,8 +213,8 @@ static void __process_pages_contig(struct address_space *mapping,
for (i = 0; i < found_folios; i++) {
struct folio *folio = fbatch.folios[i];
- process_one_page(fs_info, &folio->page, locked_page,
- page_ops, start, end);
+ process_one_folio(fs_info, folio, locked_folio,
+ page_ops, start, end);
}
folio_batch_release(&fbatch);
cond_resched();
@@ -216,24 +222,23 @@ static void __process_pages_contig(struct address_space *mapping,
}
static noinline void __unlock_for_delalloc(const struct inode *inode,
- const struct page *locked_page,
+ const struct folio *locked_folio,
u64 start, u64 end)
{
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- ASSERT(locked_page);
- if (index == locked_page->index && end_index == index)
+ ASSERT(locked_folio);
+ if (index == locked_folio->index && end_index == index)
return;
- __process_pages_contig(inode->i_mapping, locked_page, start, end,
- PAGE_UNLOCK);
+ __process_folios_contig(inode->i_mapping, locked_folio, start, end,
+ PAGE_UNLOCK);
}
-static noinline int lock_delalloc_pages(struct inode *inode,
- const struct page *locked_page,
- u64 start,
- u64 end)
+static noinline int lock_delalloc_folios(struct inode *inode,
+ const struct folio *locked_folio,
+ u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct address_space *mapping = inode->i_mapping;
@@ -243,7 +248,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 processed_end = start;
struct folio_batch fbatch;
- if (index == locked_page->index && index == end_index)
+ if (index == locked_folio->index && index == end_index)
return 0;
folio_batch_init(&fbatch);
@@ -257,23 +262,22 @@ static noinline int lock_delalloc_pages(struct inode *inode,
for (i = 0; i < found_folios; i++) {
struct folio *folio = fbatch.folios[i];
- struct page *page = folio_page(folio, 0);
u32 len = end + 1 - start;
- if (page == locked_page)
+ if (folio == locked_folio)
continue;
if (btrfs_folio_start_writer_lock(fs_info, folio, start,
len))
goto out;
- if (!PageDirty(page) || page->mapping != mapping) {
+ if (!folio_test_dirty(folio) || folio->mapping != mapping) {
btrfs_folio_end_writer_lock(fs_info, folio, start,
len);
goto out;
}
- processed_end = page_offset(page) + PAGE_SIZE - 1;
+ processed_end = folio_pos(folio) + folio_size(folio) - 1;
}
folio_batch_release(&fbatch);
cond_resched();
@@ -283,7 +287,8 @@ static noinline int lock_delalloc_pages(struct inode *inode,
out:
folio_batch_release(&fbatch);
if (processed_end > start)
- __unlock_for_delalloc(inode, locked_page, start, processed_end);
+ __unlock_for_delalloc(inode, locked_folio, start,
+ processed_end);
return -EAGAIN;
}
@@ -304,8 +309,8 @@ out:
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
- struct page *locked_page, u64 *start,
- u64 *end)
+ struct folio *locked_folio,
+ u64 *start, u64 *end)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -323,9 +328,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
/* Caller should pass a valid @end to indicate the search range end */
ASSERT(orig_end > orig_start);
- /* The range should at least cover part of the page */
- ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
- orig_end <= page_offset(locked_page)));
+ /* The range should at least cover part of the folio */
+ ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) ||
+ orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
@@ -342,25 +347,25 @@ again:
}
/*
- * start comes from the offset of locked_page. We have to lock
- * pages in order, so we can't process delalloc bytes before
- * locked_page
+ * start comes from the offset of locked_folio. We have to lock
+ * folios in order, so we can't process delalloc bytes before
+ * locked_folio
*/
if (delalloc_start < *start)
delalloc_start = *start;
/*
- * make sure to limit the number of pages we try to lock down
+ * make sure to limit the number of folios we try to lock down
*/
if (delalloc_end + 1 - delalloc_start > max_bytes)
delalloc_end = delalloc_start + max_bytes - 1;
- /* step two, lock all the pages after the page that has start */
- ret = lock_delalloc_pages(inode, locked_page,
- delalloc_start, delalloc_end);
+ /* step two, lock all the folios after the folio that has start */
+ ret = lock_delalloc_folios(inode, locked_folio, delalloc_start,
+ delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
- /* some of the pages are gone, lets avoid looping by
+ /* some of the folios are gone, let's avoid looping by
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
@@ -384,8 +389,8 @@ again:
unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
- __unlock_for_delalloc(inode, locked_page,
- delalloc_start, delalloc_end);
+ __unlock_for_delalloc(inode, locked_folio, delalloc_start,
+ delalloc_end);
cond_resched();
goto again;
}
@@ -396,40 +401,41 @@ out_failed:
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- const struct page *locked_page,
+ const struct folio *locked_folio,
struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
- __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
- start, end, page_ops);
+ __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
+ end, page_ops);
}
-static bool btrfs_verify_page(struct page *page, u64 start)
+static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len)
{
- if (!fsverity_active(page->mapping->host) ||
- PageUptodate(page) ||
- start >= i_size_read(page->mapping->host))
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+
+ if (!fsverity_active(folio->mapping->host) ||
+ btrfs_folio_test_uptodate(fs_info, folio, start, len) ||
+ start >= i_size_read(folio->mapping->host))
return true;
- return fsverity_verify_page(page);
+ return fsverity_verify_folio(folio);
}
-static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len)
{
- struct btrfs_fs_info *fs_info = page_to_fs_info(page);
- struct folio *folio = page_folio(page);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_pos(folio) + PAGE_SIZE);
- if (uptodate && btrfs_verify_page(page, start))
+ if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, page->mapping))
- unlock_page(page);
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
+ folio_unlock(folio);
else
btrfs_subpage_end_reader(fs_info, folio, start, len);
}
@@ -471,8 +477,8 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
"incomplete page write with offset %zu and length %zu",
fi.offset, fi.length);
- btrfs_finish_ordered_extent(bbio->ordered,
- folio_page(folio, 0), start, len, !error);
+ btrfs_finish_ordered_extent(bbio->ordered, folio, start, len,
+ !error);
if (error)
mapping_set_error(folio->mapping, error);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
@@ -481,85 +487,14 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
bio_put(bio);
}
-/*
- * Record previously processed extent range
- *
- * For endio_readpage_release_extent() to handle a full extent range, reducing
- * the extent io operations.
- */
-struct processed_extent {
- struct btrfs_inode *inode;
- /* Start of the range in @inode */
- u64 start;
- /* End of the range in @inode */
- u64 end;
- bool uptodate;
-};
-
-/*
- * Try to release processed extent range
- *
- * May not release the extent range right now if the current range is
- * contiguous to processed extent.
- *
- * Will release processed extent when any of @inode, @uptodate, the range is
- * no longer contiguous to the processed range.
- *
- * Passing @inode == NULL will force processed extent to be released.
- */
-static void endio_readpage_release_extent(struct processed_extent *processed,
- struct btrfs_inode *inode, u64 start, u64 end,
- bool uptodate)
-{
- struct extent_state *cached = NULL;
- struct extent_io_tree *tree;
-
- /* The first extent, initialize @processed */
- if (!processed->inode)
- goto update;
-
- /*
- * Contiguous to processed extent, just uptodate the end.
- *
- * Several things to notice:
- *
- * - bio can be merged as long as on-disk bytenr is contiguous
- * This means we can have page belonging to other inodes, thus need to
- * check if the inode still matches.
- * - bvec can contain range beyond current page for multi-page bvec
- * Thus we need to do processed->end + 1 >= start check
- */
- if (processed->inode == inode && processed->uptodate == uptodate &&
- processed->end + 1 >= start && end >= processed->end) {
- processed->end = end;
- return;
- }
-
- tree = &processed->inode->io_tree;
- /*
- * Now we don't have range contiguous to the processed range, release
- * the processed range now.
- */
- unlock_extent(tree, processed->start, processed->end, &cached);
-
-update:
- /* Update processed to current range */
- processed->inode = inode;
- processed->start = start;
- processed->end = end;
- processed->uptodate = uptodate;
-}
-
-static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
+static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
ASSERT(folio_test_locked(folio));
if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
ASSERT(folio_test_private(folio));
- btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE);
+ btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE);
}
/*
@@ -578,7 +513,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
{
struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
- struct processed_extent processed = { 0 };
struct folio_iter fi;
const u32 sectorsize = fs_info->sectorsize;
@@ -642,12 +576,8 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/* Update page status and unlock. */
- end_page_read(folio_page(folio, 0), uptodate, start, len);
- endio_readpage_release_extent(&processed, BTRFS_I(inode),
- start, end, uptodate);
+ end_folio_read(folio, uptodate, start, len);
}
- /* Release the last extent */
- endio_readpage_release_extent(&processed, NULL, 0, 0, false);
bio_put(bio);
}
@@ -737,12 +667,13 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
}
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
- struct page *page, u64 disk_bytenr,
+ struct folio *folio, u64 disk_bytenr,
unsigned int pg_offset)
{
struct bio *bio = &bio_ctrl->bbio->bio;
struct bio_vec *bvec = bio_last_bvec_all(bio);
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
+ struct folio *bv_folio = page_folio(bvec->bv_page);
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
/*
@@ -755,7 +686,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
/*
* The contig check requires the following conditions to be met:
*
- * 1) The pages are belonging to the same inode
+ * 1) The folios belong to the same inode
* This is implied by the call chain.
*
* 2) The range has adjacent logical bytenr
@@ -764,8 +695,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
* This is required for the usage of btrfs_bio->file_offset.
*/
return bio_end_sector(bio) == sector &&
- page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len ==
- page_offset(page) + pg_offset;
+ folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len ==
+ folio_pos(folio) + pg_offset;
}
static void alloc_new_bio(struct btrfs_inode *inode,
@@ -818,17 +749,17 @@ static void alloc_new_bio(struct btrfs_inode *inode,
* The mirror number for this IO should already be initialized in
* @bio_ctrl->mirror_num.
*/
-static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
- u64 disk_bytenr, struct page *page,
+static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
+ u64 disk_bytenr, struct folio *folio,
size_t size, unsigned long pg_offset)
{
- struct btrfs_inode *inode = page_to_inode(page);
+ struct btrfs_inode *inode = folio_to_inode(folio);
ASSERT(pg_offset + size <= PAGE_SIZE);
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
- !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
+ !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset))
submit_one_bio(bio_ctrl);
do {
@@ -837,7 +768,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
/* Allocate new bio if needed */
if (!bio_ctrl->bbio) {
alloc_new_bio(inode, bio_ctrl, disk_bytenr,
- page_offset(page) + pg_offset);
+ folio_pos(folio) + pg_offset);
}
/* Cap to the current ordered extent boundary if there is one. */
@@ -847,21 +778,22 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
len = bio_ctrl->len_to_oe_boundary;
}
- if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) {
+ if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) {
/* bio full: move on to a new one */
submit_one_bio(bio_ctrl);
continue;
}
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, page, len);
+ wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
+ len);
size -= len;
pg_offset += len;
disk_bytenr += len;
/*
- * len_to_oe_boundary defaults to U32_MAX, which isn't page or
+ * len_to_oe_boundary defaults to U32_MAX, which isn't folio or
* sector aligned. alloc_new_bio() then sets it to the end of
* our ordered extent for writes into zoned devices.
*
@@ -871,15 +803,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
* boundary is correct.
*
* When len_to_oe_boundary is U32_MAX, the cap above would
- * result in a 4095 byte IO for the last page right before
- * we hit the bio limit of UINT_MAX. bio_add_page() has all
+ * result in a 4095 byte IO for the last folio right before
+ * we hit the bio limit of UINT_MAX. bio_add_folio() has all
* the checks required to make sure we don't overflow the bio,
* and we should just ignore len_to_oe_boundary completely
* unless we're using it to track an ordered extent.
*
* It's pretty hard to make a bio sized U32_MAX, but it can
* happen when the page cache is able to feed us contiguous
- * pages for large extents.
+ * folios for large extents.
*/
if (bio_ctrl->len_to_oe_boundary != U32_MAX)
bio_ctrl->len_to_oe_boundary -= len;
@@ -952,27 +884,28 @@ int set_folio_extent_mapped(struct folio *folio)
return 0;
}
-void clear_page_extent_mapped(struct page *page)
+void clear_folio_extent_mapped(struct folio *folio)
{
- struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
- ASSERT(page->mapping);
+ ASSERT(folio->mapping);
if (!folio_test_private(folio))
return;
- fs_info = page_to_fs_info(page);
- if (btrfs_is_subpage(fs_info, page->mapping))
+ fs_info = folio_to_fs_info(folio);
+ if (btrfs_is_subpage(fs_info, folio->mapping))
return btrfs_detach_subpage(fs_info, folio);
folio_detach_private(folio);
}
-static struct extent_map *__get_extent_map(struct inode *inode, struct page *page,
- u64 start, u64 len, struct extent_map **em_cached)
+static struct extent_map *__get_extent_map(struct inode *inode,
+ struct folio *folio, u64 start,
+ u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
+ struct extent_state *cached_state = NULL;
ASSERT(em_cached);
@@ -988,12 +921,15 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag
*em_cached = NULL;
}
- em = btrfs_get_extent(BTRFS_I(inode), page, start, len);
+ btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state);
+ em = btrfs_get_extent(BTRFS_I(inode), folio, start, len);
if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
}
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state);
+
return em;
}
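A short sketch of the caller-side pattern the @em_cached parameter enables; demo_read_range() and the submission placeholder are assumptions, the reference handling mirrors what btrfs_do_readpage() does below.

/* Illustrative sketch, not part of the patch. */
static int demo_read_range(struct inode *inode, struct folio *folio,
			   u64 start, u64 end)
{
	struct extent_map *em_cached = NULL;
	u64 cur = start;

	while (cur <= end) {
		struct extent_map *em;

		/* Returns a referenced map and caches it for the next call. */
		em = __get_extent_map(inode, folio, cur, end - cur + 1, &em_cached);
		if (IS_ERR(em)) {
			free_extent_map(em_cached);
			return PTR_ERR(em);
		}

		/* ... zero or submit the part of [cur, end] covered by @em ... */
		cur = min(extent_map_end(em), end + 1);
		free_extent_map(em);
	}
	free_extent_map(em_cached);
	return 0;
}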
/*
@@ -1003,12 +939,12 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- u64 start = page_offset(page);
+ u64 start = folio_pos(folio);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
@@ -1019,25 +955,23 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = fs_info->sectorsize;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_extent(tree, start, end, NULL);
- unlock_page(page);
+ folio_unlock(folio);
return ret;
}
- if (page->index == last_byte >> PAGE_SHIFT) {
- size_t zero_offset = offset_in_page(last_byte);
+ if (folio->index == last_byte >> folio_shift(folio)) {
+ size_t zero_offset = offset_in_folio(folio, last_byte);
if (zero_offset) {
- iosize = PAGE_SIZE - zero_offset;
- memzero_page(page, zero_offset, iosize);
+ iosize = folio_size(folio) - zero_offset;
+ folio_zero_range(folio, zero_offset, iosize);
}
}
bio_ctrl->end_io_func = end_bbio_data_read;
- begin_page_read(fs_info, page);
+ begin_folio_read(fs_info, folio);
while (cur <= end) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
bool force_bio_submit = false;
@@ -1045,16 +979,15 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- iosize = PAGE_SIZE - pg_offset;
- memzero_page(page, pg_offset, iosize);
- unlock_extent(tree, cur, cur + iosize - 1, NULL);
- end_page_read(page, true, cur, iosize);
+ iosize = folio_size(folio) - pg_offset;
+ folio_zero_range(folio, pg_offset, iosize);
+ end_folio_read(folio, true, cur, iosize);
break;
}
- em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached);
+ em = __get_extent_map(inode, folio, cur, end - cur + 1,
+ em_cached);
if (IS_ERR(em)) {
- unlock_extent(tree, cur, end, NULL);
- end_page_read(page, false, cur, end + 1 - cur);
+ end_folio_read(folio, false, cur, end + 1 - cur);
return PTR_ERR(em);
}
extent_offset = cur - em->start;
@@ -1079,8 +1012,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
- * single bio to populate the pages for the 2 ranges because
- * this makes the compressed extent read zero out the pages
+ * single bio to populate the folios for the 2 ranges because
+ * this makes the compressed extent read zero out the folios
* belonging to the 2nd range. Imagine the following scenario:
*
* File layout
@@ -1093,13 +1026,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* [extent X, compressed length = 4K uncompressed length = 16K]
*
* If the bio to read the compressed extent covers both ranges,
- * it will decompress extent X into the pages belonging to the
+ * it will decompress extent X into the folios belonging to the
* first range and then it will stop, zeroing out the remaining
- * pages that belong to the other range that points to extent X.
+ * folios that belong to the other range that points to extent X.
* So here we make sure we submit 2 bios, one for the first
* range and another one for the third range. Both will target
* the same physical extent from disk, but we can't currently
- * make the compressed bio endio callback populate the pages
+ * make the compressed bio endio callback populate the folios
* for both ranges because each compressed bio is tightly
* coupled with a single extent map, and each range can have
* an extent map with a different offset value relative to the
@@ -1120,18 +1053,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- memzero_page(page, pg_offset, iosize);
+ folio_zero_range(folio, pg_offset, iosize);
- unlock_extent(tree, cur, cur + iosize - 1, NULL);
- end_page_read(page, true, cur, iosize);
+ end_folio_read(folio, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
- /* the get_extent function already copied into the page */
+ /* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- unlock_extent(tree, cur, cur + iosize - 1, NULL);
- end_page_read(page, true, cur, iosize);
+ end_folio_read(folio, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -1144,8 +1075,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
- pg_offset);
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize,
+ pg_offset);
cur = cur + iosize;
pg_offset += iosize;
}
@@ -1155,17 +1086,11 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int btrfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct btrfs_inode *inode = page_to_inode(page);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
struct extent_map *em_cached = NULL;
int ret;
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL);
+ ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
free_extent_map(em_cached);
/*
@@ -1176,28 +1101,8 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static inline void contiguous_readpages(struct page *pages[], int nr_pages,
- u64 start, u64 end,
- struct extent_map **em_cached,
- struct btrfs_bio_ctrl *bio_ctrl,
- u64 *prev_em_start)
-{
- struct btrfs_inode *inode = page_to_inode(pages[0]);
- int index;
-
- ASSERT(em_cached);
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- for (index = 0; index < nr_pages; index++) {
- btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
- prev_em_start);
- put_page(pages[index]);
- }
-}
-
/*
- * helper for __extent_writepage, doing all of the delayed allocation setup.
+ * helper for extent_writepage(), doing all of the delayed allocation setup.
*
* This returns 1 if btrfs_run_delalloc_range function did all the work required
* to write the page (copy into inline extent). In this case the IO has
@@ -1207,13 +1112,14 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
* This returns < 0 if there were errors (page still locked)
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
- struct page *page, struct writeback_control *wbc)
+ struct folio *folio,
+ struct btrfs_bio_ctrl *bio_ctrl)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
- struct folio *folio = page_folio(page);
- const bool is_subpage = btrfs_is_subpage(fs_info, page->mapping);
- const u64 page_start = page_offset(page);
- const u64 page_end = page_start + PAGE_SIZE - 1;
+ struct writeback_control *wbc = bio_ctrl->wbc;
+ const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
+ const u64 page_start = folio_pos(folio);
+ const u64 page_end = page_start + folio_size(folio) - 1;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
* page boundary, thus we cannot rely on subpage bitmap to locate the
@@ -1225,10 +1131,18 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
u64 delalloc_to_write = 0;
int ret = 0;
- /* Lock all (subpage) delalloc ranges inside the page first. */
+ /* Save the dirty bitmap as our submission bitmap will be a subset of it. */
+ if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
+ ASSERT(fs_info->sectors_per_page > 1);
+ btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
+ } else {
+ bio_ctrl->submit_bitmap = 1;
+ }
+
+ /* Lock all (subpage) delalloc ranges inside the folio first. */
while (delalloc_start < page_end) {
delalloc_end = page_end;
- if (!find_lock_delalloc_range(&inode->vfs_inode, page,
+ if (!find_lock_delalloc_range(&inode->vfs_inode, folio,
&delalloc_start, &delalloc_end)) {
delalloc_start = delalloc_end + 1;
continue;
@@ -1253,7 +1167,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
if (!is_subpage) {
/*
* For non-subpage case, the found delalloc range must
- * cover this page and there must be only one locked
+ * cover this folio and there must be only one locked
* delalloc range.
*/
found_start = page_start;
@@ -1267,7 +1181,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
break;
/*
* The subpage range covers the last sector, the delalloc range may
- * end beyond the page boundary, use the saved delalloc_end
+ * end beyond the folio boundary, use the saved delalloc_end
* instead.
*/
if (found_start + found_len >= page_end)
@@ -1275,7 +1189,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
if (ret >= 0) {
/* No errors hit so far, run the current delalloc range. */
- ret = btrfs_run_delalloc_range(inode, page, found_start,
+ ret = btrfs_run_delalloc_range(inode, folio,
+ found_start,
found_start + found_len - 1,
wbc);
} else {
@@ -1285,30 +1200,27 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
*/
unlock_extent(&inode->io_tree, found_start,
found_start + found_len - 1, NULL);
- __unlock_for_delalloc(&inode->vfs_inode, page, found_start,
+ __unlock_for_delalloc(&inode->vfs_inode, folio,
+ found_start,
found_start + found_len - 1);
}
/*
- * We can hit btrfs_run_delalloc_range() with >0 return value.
- *
- * This happens when either the IO is already done and page
- * unlocked (inline) or the IO submission and page unlock would
- * be handled as async (compression).
- *
- * Inline is only possible for regular sectorsize for now.
- *
- * Compression is possible for both subpage and regular cases,
- * but even for subpage compression only happens for page aligned
- * range, thus the found delalloc range must go beyond current
- * page.
+ * We have some ranges that are going to be submitted asynchronously
+ * (compression or inline). These ranges have their own control
+ * over when to unlock the folios. We should not touch them
+ * anymore, so clear the range from the submission bitmap.
*/
- if (ret > 0)
- ASSERT(!is_subpage || found_start + found_len >= page_end);
-
+ if (ret > 0) {
+ unsigned int start_bit = (found_start - page_start) >>
+ fs_info->sectorsize_bits;
+ unsigned int end_bit = (min(page_end + 1, found_start + found_len) -
+ page_start) >> fs_info->sectorsize_bits;
+ bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit);
+ }
/*
- * Above btrfs_run_delalloc_range() may have unlocked the page,
- * thus for the last range, we cannot touch the page anymore.
+ * Above btrfs_run_delalloc_range() may have unlocked the folio,
+ * thus for the last range, we cannot touch the folio anymore.
*/
if (found_start + found_len >= last_delalloc_end + 1)
break;
@@ -1330,10 +1242,10 @@ out:
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
/*
- * If btrfs_run_dealloc_range() already started I/O and unlocked
- * the pages, we just need to account for them here.
+ * If all ranges are submitted asynchronously, we just need to account
+ * for them here.
*/
- if (ret == 1) {
+ if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1351,182 +1263,148 @@ out:
}
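The start_bit/end_bit arithmetic above is how a byte range inside the folio maps onto the per-sector submission bitmap; a minimal restatement of that mapping, with demo_clear_submit_range() being a hypothetical helper that assumes [start, start + len) lies within the folio.

/* Illustrative sketch, not part of the patch. */
static void demo_clear_submit_range(const struct btrfs_fs_info *fs_info,
				    struct btrfs_bio_ctrl *bio_ctrl,
				    struct folio *folio, u64 start, u32 len)
{
	/* One bit per sector, bit 0 being the first sector of the folio. */
	unsigned int start_bit = (start - folio_pos(folio)) >> fs_info->sectorsize_bits;
	unsigned int nbits = len >> fs_info->sectorsize_bits;

	bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, nbits);
}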
/*
- * Find the first byte we need to write.
+ * Return 0 if we have submitted or queued the sector for submission.
+ * Return <0 for critical errors.
*
- * For subpage, one page can contain several sectors, and
- * __extent_writepage_io() will just grab all extent maps in the page
- * range and try to submit all non-inline/non-compressed extents.
- *
- * This is a big problem for subpage, we shouldn't re-submit already written
- * data at all.
- * This function will lookup subpage dirty bit to find which range we really
- * need to submit.
- *
- * Return the next dirty range in [@start, @end).
- * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ * Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
-static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 *start, u64 *end)
+static int submit_one_sector(struct btrfs_inode *inode,
+ struct folio *folio,
+ u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
+ loff_t i_size)
{
- struct folio *folio = page_folio(page);
- struct btrfs_subpage *subpage = folio_get_private(folio);
- struct btrfs_subpage_info *spi = fs_info->subpage_info;
- u64 orig_start = *start;
- /* Declare as unsigned long so we can use bitmap ops */
- unsigned long flags;
- int range_start_bit;
- int range_end_bit;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map *em;
+ u64 block_start;
+ u64 disk_bytenr;
+ u64 extent_offset;
+ u64 em_end;
+ const u32 sectorsize = fs_info->sectorsize;
- /*
- * For regular sector size == page size case, since one page only
- * contains one sector, we return the page offset directly.
- */
- if (!btrfs_is_subpage(fs_info, page->mapping)) {
- *start = page_offset(page);
- *end = page_offset(page) + PAGE_SIZE;
- return;
- }
+ ASSERT(IS_ALIGNED(filepos, sectorsize));
+
+ /* @filepos >= i_size case should be handled by the caller. */
+ ASSERT(filepos < i_size);
- range_start_bit = spi->dirty_offset +
- (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+ em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
+ if (IS_ERR(em))
+ return PTR_ERR_OR_ZERO(em);
- /* We should have the page locked, but just in case */
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
- spi->dirty_offset + spi->bitmap_nr_bits);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ extent_offset = filepos - em->start;
+ em_end = extent_map_end(em);
+ ASSERT(filepos <= em_end);
+ ASSERT(IS_ALIGNED(em->start, sectorsize));
+ ASSERT(IS_ALIGNED(em->len, sectorsize));
- range_start_bit -= spi->dirty_offset;
- range_end_bit -= spi->dirty_offset;
+ block_start = extent_map_block_start(em);
+ disk_bytenr = extent_map_block_start(em) + extent_offset;
- *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
- *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
+ ASSERT(!extent_map_is_compressed(em));
+ ASSERT(block_start != EXTENT_MAP_HOLE);
+ ASSERT(block_start != EXTENT_MAP_INLINE);
+
+ free_extent_map(em);
+ em = NULL;
+
+ btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
+ /*
+ * The above call should set the writeback flag on the whole folio,
+ * even for just a single subpage sector.
+ * As long as the folio is properly locked and the range is correct,
+ * the folio should always end up with the writeback flag set.
+ */
+ ASSERT(folio_test_writeback(folio));
+
+ /*
+ * Although the PageDirty bit is cleared before entering this
+ * function, the subpage dirty bit is not. Clear it here so that
+ * next time we won't submit the folio for a range already
+ * written to disk.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio,
+ sectorsize, filepos - folio_pos(folio));
+ return 0;
}
/*
- * helper for __extent_writepage. This calls the writepage start hooks,
+ * Helper for extent_writepage(). This calls the writepage start hooks,
* and does the loop to map the page into extents and bios.
*
* We return 1 if the IO is started and the page is unlocked,
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
-static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
- struct page *page, u64 start, u32 len,
- struct btrfs_bio_ctrl *bio_ctrl,
- loff_t i_size,
- int *nr_ret)
+static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
+ struct folio *folio,
+ u64 start, u32 len,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ loff_t i_size)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 cur = start;
- u64 end = start + len - 1;
- u64 extent_offset;
- u64 block_start;
- struct extent_map *em;
+ unsigned long range_bitmap = 0;
+ bool submitted_io = false;
+ const u64 folio_start = folio_pos(folio);
+ u64 cur;
+ int bit;
int ret = 0;
- int nr = 0;
- ASSERT(start >= page_offset(page) &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ ASSERT(start >= folio_start &&
+ start + len <= folio_start + folio_size(folio));
- ret = btrfs_writepage_cow_fixup(page);
+ ret = btrfs_writepage_cow_fixup(folio);
if (ret) {
/* Fixup worker will requeue */
- redirty_page_for_writepage(bio_ctrl->wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(bio_ctrl->wbc, folio);
+ folio_unlock(folio);
return 1;
}
+ for (cur = start; cur < start + len; cur += fs_info->sectorsize)
+ set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
+ bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
+ fs_info->sectors_per_page);
+
bio_ctrl->end_io_func = end_bbio_data_write;
- while (cur <= end) {
- u32 len = end - cur + 1;
- u64 disk_bytenr;
- u64 em_end;
- u64 dirty_range_start = cur;
- u64 dirty_range_end;
- u32 iosize;
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
- btrfs_mark_ordered_io_finished(inode, page, cur, len,
- true);
+ btrfs_mark_ordered_io_finished(inode, folio, cur,
+ start + len - cur, true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
* But we still need to clear the dirty subpage bit, or
- * the next time the page gets dirtied, we will try to
+ * the next time the folio gets dirtied, we will try to
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len);
+ btrfs_folio_clear_dirty(fs_info, folio, cur,
+ start + len - cur);
break;
}
-
- find_next_dirty_byte(fs_info, page, &dirty_range_start,
- &dirty_range_end);
- if (cur < dirty_range_start) {
- cur = dirty_range_start;
- continue;
- }
-
- em = btrfs_get_extent(inode, NULL, cur, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR_OR_ZERO(em);
- goto out_error;
- }
-
- extent_offset = cur - em->start;
- em_end = extent_map_end(em);
- ASSERT(cur <= em_end);
- ASSERT(cur < end);
- ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
- ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
-
- block_start = extent_map_block_start(em);
- disk_bytenr = extent_map_block_start(em) + extent_offset;
-
- ASSERT(!extent_map_is_compressed(em));
- ASSERT(block_start != EXTENT_MAP_HOLE);
- ASSERT(block_start != EXTENT_MAP_INLINE);
-
- /*
- * Note that em_end from extent_map_end() and dirty_range_end from
- * find_next_dirty_byte() are all exclusive
- */
- iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
- free_extent_map(em);
- em = NULL;
-
- btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
- if (!PageWriteback(page)) {
- btrfs_err(inode->root->fs_info,
- "page %lu not writeback, cur %llu end %llu",
- page->index, cur, end);
- }
-
- /*
- * Although the PageDirty bit is cleared before entering this
- * function, subpage dirty bit is not cleared.
- * So clear subpage dirty bit here so next time we won't submit
- * page for range already written to disk.
- */
- btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
-
- submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
- cur - page_offset(page));
- cur += iosize;
- nr++;
+ ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
+ if (ret < 0)
+ goto out;
+ submitted_io = true;
}
- btrfs_folio_assert_not_dirty(fs_info, page_folio(page), start, len);
- *nr_ret = nr;
- return 0;
-
-out_error:
+ btrfs_folio_assert_not_dirty(fs_info, folio, start, len);
+out:
/*
- * If we finish without problem, we should not only clear page dirty,
- * but also empty subpage dirty bits
+ * If we didn't submit any sector (>= i_size), the folio dirty flag
+ * is cleared but PAGECACHE_TAG_DIRTY is not (it is only cleared
+ * by folio_start_writeback() if the folio is not dirty).
+ *
+ * Here we set and then clear writeback for the range. If the full
+ * folio is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*/
- *nr_ret = nr;
+ if (!submitted_io) {
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
+ }
return ret;
}
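extent_writepage_io() is now driven purely by submit_bitmap, where each bit stands for one sector of the folio; the two hypothetical helpers below just spell out the bit-to-offset convention used in the loop above.

/* Illustrative sketch, not part of the patch. */
static u64 demo_bit_to_pos(const struct btrfs_fs_info *fs_info,
			   struct folio *folio, int bit)
{
	return folio_pos(folio) + ((u64)bit << fs_info->sectorsize_bits);
}

static int demo_pos_to_bit(const struct btrfs_fs_info *fs_info,
			   struct folio *folio, u64 filepos)
{
	return (filepos - folio_pos(folio)) >> fs_info->sectorsize_bits;
}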
@@ -1539,62 +1417,65 @@ out_error:
* Return 0 if everything goes well.
* Return <0 for error.
*/
-static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
+static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
- const u64 page_start = page_offset(page);
+ struct inode *inode = folio->mapping->host;
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ const u64 page_start = folio_pos(folio);
int ret;
- int nr = 0;
size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- trace___extent_writepage(page, inode, bio_ctrl->wbc);
+ trace_extent_writepage(folio, inode, bio_ctrl->wbc);
- WARN_ON(!PageLocked(page));
+ WARN_ON(!folio_test_locked(folio));
- pg_offset = offset_in_page(i_size);
- if (page->index > end_index ||
- (page->index == end_index && !pg_offset)) {
+ pg_offset = offset_in_folio(folio, i_size);
+ if (folio->index > end_index ||
+ (folio->index == end_index && !pg_offset)) {
folio_invalidate(folio, 0, folio_size(folio));
folio_unlock(folio);
return 0;
}
- if (page->index == end_index)
- memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
+ if (folio->index == end_index)
+ folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
- ret = set_page_extent_mapped(page);
+ /*
+ * Default to unlocking the whole folio.
+ * The proper bitmap cannot be initialized until writepage_delalloc().
+ */
+ bio_ctrl->submit_bitmap = (unsigned long)-1;
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto done;
- ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
+ ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl);
if (ret == 1)
return 0;
if (ret)
goto done;
- ret = __extent_writepage_io(BTRFS_I(inode), page, page_offset(page),
- PAGE_SIZE, bio_ctrl, i_size, &nr);
+ ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio),
+ PAGE_SIZE, bio_ctrl, i_size);
if (ret == 1)
return 0;
bio_ctrl->wbc->nr_to_write--;
done:
- if (nr == 0) {
- /* make sure the mapping tag for page dirty gets cleared */
- set_page_writeback(page);
- end_page_writeback(page);
- }
if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start,
- PAGE_SIZE, !ret);
- mapping_set_error(page->mapping, ret);
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
+ page_start, PAGE_SIZE, !ret);
+ mapping_set_error(folio->mapping, ret);
}
- btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio);
+ /*
+ * Only unlock ranges that are submitted, as there can be some async
+ * submitted ranges inside the folio.
+ */
+ btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
ASSERT(ret <= 0);
return ret;
}
@@ -1846,7 +1727,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
folio_unlock(folio);
}
}
- btrfs_submit_bio(bbio, 0);
+ btrfs_submit_bbio(bbio, 0);
}
/*
@@ -1863,17 +1744,16 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
* Return >=0 for the number of submitted extent buffers.
* Return <0 for fatal error.
*/
-static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
+static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
{
- struct btrfs_fs_info *fs_info = page_to_fs_info(page);
- struct folio *folio = page_folio(page);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
int submitted = 0;
- u64 page_start = page_offset(page);
+ u64 folio_start = folio_pos(folio);
int bit_start = 0;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
/* Lock and write each dirty extent buffer in the range */
- while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
+ while (bit_start < fs_info->sectors_per_page) {
struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
@@ -1883,21 +1763,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
- spin_lock(&page->mapping->i_private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
if (!folio_test_private(folio)) {
- spin_unlock(&page->mapping->i_private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->i_private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
bit_start++;
continue;
}
- start = page_start + bit_start * fs_info->sectorsize;
+ start = folio_start + bit_start * fs_info->sectorsize;
bit_start += sectors_per_node;
/*
@@ -1906,7 +1786,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->i_private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
@@ -1945,19 +1825,18 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* previous call.
* Return <0 for fatal error.
*/
-static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
+static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx)
{
struct writeback_control *wbc = ctx->wbc;
- struct address_space *mapping = page->mapping;
- struct folio *folio = page_folio(page);
+ struct address_space *mapping = folio->mapping;
struct extent_buffer *eb;
int ret;
if (!folio_test_private(folio))
return 0;
- if (page_to_fs_info(page)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(page, wbc);
+ if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ return submit_eb_subpage(folio, wbc);
spin_lock(&mapping->i_private_lock);
if (!folio_test_private(folio)) {
@@ -2055,7 +1934,7 @@ retry:
for (i = 0; i < nr_folios; i++) {
struct folio *folio = fbatch.folios[i];
- ret = submit_eb_page(&folio->page, &ctx);
+ ret = submit_eb_page(folio, &ctx);
if (ret == 0)
continue;
if (ret < 0) {
@@ -2109,7 +1988,7 @@ retry:
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*
- * We can get ret > 0 from submit_extent_page() indicating how many ebs
+ * We can get ret > 0 from submit_extent_folio() indicating how many ebs
* were submitted. Reset it to 0 to avoid false alerts for the caller.
*/
if (ret > 0)
@@ -2248,7 +2127,7 @@ retry:
continue;
}
- ret = __extent_writepage(&folio->page, bio_ctrl);
+ ret = extent_writepage(folio, bio_ctrl);
if (ret < 0) {
done = 1;
break;
@@ -2295,7 +2174,7 @@ retry:
* already been run (aka, ordered extent inserted) and all pages are still
* locked.
*/
-void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
@@ -2319,37 +2198,46 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa
while (cur <= end) {
u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
u32 cur_len = cur_end + 1 - cur;
- struct page *page;
- int nr = 0;
+ struct folio *folio;
- page = find_get_page(mapping, cur >> PAGE_SHIFT);
- ASSERT(PageLocked(page));
- if (pages_dirty && page != locked_page)
- ASSERT(PageDirty(page));
+ folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0);
- ret = __extent_writepage_io(BTRFS_I(inode), page, cur, cur_len,
- &bio_ctrl, i_size, &nr);
+ /*
+ * This shouldn't happen, as the folios are pinned and locked. This
+ * code is just in case, but shouldn't actually be run.
+ */
+ if (IS_ERR(folio)) {
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+ cur, cur_len, false);
+ mapping_set_error(mapping, PTR_ERR(folio));
+ cur = cur_end + 1;
+ continue;
+ }
+
+ ASSERT(folio_test_locked(folio));
+ if (pages_dirty && folio != locked_folio)
+ ASSERT(folio_test_dirty(folio));
+
+ /*
+ * Set the submission bitmap to submit all sectors.
+ * extent_writepage_io() will do the truncation correctly.
+ */
+ bio_ctrl.submit_bitmap = (unsigned long)-1;
+ ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len,
+ &bio_ctrl, i_size);
if (ret == 1)
goto next_page;
- /* Make sure the mapping tag for page dirty gets cleared. */
- if (nr == 0) {
- struct folio *folio;
-
- folio = page_folio(page);
- btrfs_folio_set_writeback(fs_info, folio, cur, cur_len);
- btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len);
- }
if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
+ btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
cur, cur_len, !ret);
- mapping_set_error(page->mapping, ret);
+ mapping_set_error(mapping, ret);
}
- btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len);
+ btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
- put_page(page);
+ folio_put(folio);
cur = cur_end + 1;
}
@@ -2379,18 +2267,12 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb
void btrfs_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
- struct page *pagepool[16];
+ struct folio *folio;
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
- int nr;
-
- while ((nr = readahead_page_batch(rac, pagepool))) {
- u64 contig_start = readahead_pos(rac);
- u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
- contiguous_readpages(pagepool, nr, contig_start, contig_end,
- &em_cached, &bio_ctrl, &prev_em_start);
- }
+ while ((folio = readahead_folio(rac)) != NULL)
+ btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
@@ -2435,9 +2317,9 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* to drop the page.
*/
static bool try_release_extent_state(struct extent_io_tree *tree,
- struct page *page, gfp_t mask)
+ struct folio *folio, gfp_t mask)
{
- u64 start = page_offset(page);
+ u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1;
bool ret;
@@ -2473,11 +2355,11 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
-bool try_release_extent_mapping(struct page *page, gfp_t mask)
+bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
- u64 start = page_offset(page);
+ u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1;
- struct btrfs_inode *inode = page_to_inode(page);
+ struct btrfs_inode *inode = folio_to_inode(folio);
struct extent_io_tree *io_tree = &inode->io_tree;
while (start <= end) {
@@ -2546,7 +2428,7 @@ next:
cond_resched();
}
}
- return try_release_extent_state(io_tree, page, mask);
+ return try_release_extent_state(io_tree, folio, mask);
}
static void __free_extent_buffer(struct extent_buffer *eb)
@@ -2572,7 +2454,7 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
return true;
/*
* Even if there are no eb refs here, we may still have
- * end_page_read() call relying on page::private.
+ * end_folio_read() call relying on page::private.
*/
if (atomic_read(&subpage->readers))
return true;
@@ -3615,7 +3497,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
ASSERT(ret);
}
}
- btrfs_submit_bio(bbio, mirror_num);
+ btrfs_submit_bbio(bbio, mirror_num);
done:
if (wait == WAIT_COMPLETE) {
@@ -4171,17 +4053,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
- const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+ const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
{
struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
- u64 page_start = page_offset(page);
- u64 cur = page_start;
+ u64 folio_start = folio_pos(folio);
+ u64 cur = folio_start;
- ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+ ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
lockdep_assert_held(&fs_info->buffer_lock);
- while (cur < page_start + PAGE_SIZE) {
+ while (cur < folio_start + PAGE_SIZE) {
int ret;
int i;
@@ -4193,7 +4075,7 @@ static struct extent_buffer *get_next_extent_buffer(
goto out;
for (i = 0; i < ret; i++) {
/* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
+ if (gang[i]->start >= folio_start + PAGE_SIZE)
goto out;
/* Found one */
if (gang[i]->start >= bytenr) {
@@ -4207,11 +4089,11 @@ out:
return found;
}
-static int try_release_subpage_extent_buffer(struct page *page)
+static int try_release_subpage_extent_buffer(struct folio *folio)
{
- struct btrfs_fs_info *fs_info = page_to_fs_info(page);
- u64 cur = page_offset(page);
- const u64 end = page_offset(page) + PAGE_SIZE;
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ u64 cur = folio_pos(folio);
+ const u64 end = cur + PAGE_SIZE;
int ret;
while (cur < end) {
@@ -4226,7 +4108,7 @@ static int try_release_subpage_extent_buffer(struct page *page)
* with spinlock rather than RCU.
*/
spin_lock(&fs_info->buffer_lock);
- eb = get_next_extent_buffer(fs_info, page, cur);
+ eb = get_next_extent_buffer(fs_info, folio, cur);
if (!eb) {
/* No more eb in the page range after or at cur */
spin_unlock(&fs_info->buffer_lock);
@@ -4267,31 +4149,30 @@ static int try_release_subpage_extent_buffer(struct page *page)
* Finally to check if we have cleared folio private, as if we have
* released all ebs in the page, the folio private should be cleared now.
*/
- spin_lock(&page->mapping->i_private_lock);
- if (!folio_test_private(page_folio(page)))
+ spin_lock(&folio->mapping->i_private_lock);
+ if (!folio_test_private(folio))
ret = 1;
else
ret = 0;
- spin_unlock(&page->mapping->i_private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return ret;
}
-int try_release_extent_buffer(struct page *page)
+int try_release_extent_buffer(struct folio *folio)
{
- struct folio *folio = page_folio(page);
struct extent_buffer *eb;
- if (page_to_fs_info(page)->nodesize < PAGE_SIZE)
- return try_release_subpage_extent_buffer(page);
+ if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ return try_release_subpage_extent_buffer(folio);
/*
* We need to make sure nobody is changing folio private, as we rely on
* folio private as the pointer to extent buffer.
*/
- spin_lock(&page->mapping->i_private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
if (!folio_test_private(folio)) {
- spin_unlock(&page->mapping->i_private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return 1;
}
@@ -4306,10 +4187,10 @@ int try_release_extent_buffer(struct page *page)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&page->mapping->i_private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return 0;
}
- spin_unlock(&page->mapping->i_private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index dceebd76c7d1..8a36117ed453 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -236,11 +236,11 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
-bool try_release_extent_mapping(struct page *page, gfp_t mask);
-int try_release_extent_buffer(struct page *page);
+bool try_release_extent_mapping(struct folio *folio, gfp_t mask);
+int try_release_extent_buffer(struct folio *folio);
int btrfs_read_folio(struct file *file, struct folio *folio);
-void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty);
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
@@ -249,7 +249,7 @@ int btree_write_cache_pages(struct address_space *mapping,
void btrfs_readahead(struct readahead_control *rac);
int set_folio_extent_mapped(struct folio *folio);
int set_page_extent_mapped(struct page *page);
-void clear_page_extent_mapped(struct page *page);
+void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
@@ -354,7 +354,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- const struct page *locked_page,
+ const struct folio *locked_folio,
struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
@@ -368,7 +368,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
- struct page *locked_page, u64 *start,
+ struct folio *locked_folio, u64 *start,
u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 23b65dc73c00..25d191f1ac10 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -192,10 +192,13 @@ static inline u64 extent_map_block_len(const struct extent_map *em)
static inline u64 extent_map_block_end(const struct extent_map *em)
{
- if (extent_map_block_start(em) + extent_map_block_len(em) <
- extent_map_block_start(em))
+ const u64 block_start = extent_map_block_start(em);
+ const u64 block_end = block_start + extent_map_block_len(em);
+
+ if (block_end < block_start)
return (u64)-1;
- return extent_map_block_start(em) + extent_map_block_len(em);
+
+ return block_end;
}
static bool can_merge_extent_map(const struct extent_map *em)
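The rewritten helper above guards against u64 wraparound: if start + len overflows, the sum comes out smaller than start. A standalone sketch of that idiom, with an illustrative name (not part of the patch):

static inline u64 clamped_range_end(u64 start, u64 len)
{
        const u64 end = start + len;

        /* On u64 overflow the sum wraps and becomes smaller than start. */
        if (end < start)
                return (u64)-1;
        return end;
}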
@@ -1147,8 +1150,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
return 0;
/*
- * We want to be fast because we can be called from any path trying to
- * allocate memory, so if the lock is busy we don't want to spend time
+ * We want to be fast so if the lock is busy we don't want to spend time
* waiting for it - either some task is about to do IO for the inode or
* we may have another task shrinking extent maps, here in this code, so
* skip this inode.
@@ -1191,9 +1193,7 @@ next:
/*
* Stop if we need to reschedule or there's contention on the
* lock. This is to avoid slowing other tasks trying to take the
- * lock and because the shrinker might be called during a memory
- * allocation path and we want to avoid taking a very long time
- * and slowing down all sorts of tasks.
+ * lock.
*/
if (need_resched() || rwlock_needbreak(&tree->lock))
break;
@@ -1222,12 +1222,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
if (ctx->scanned >= ctx->nr_to_scan)
break;
- /*
- * We may be called from memory allocation paths, so we don't
- * want to take too much time and slowdown tasks.
- */
- if (need_resched())
- break;
+ cond_resched();
inode = btrfs_find_first_inode(root, min_ino);
}
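This hunk and the following one drop the "abort the scan when need_resched() is set" policy and instead yield with cond_resched() between iterations. A minimal sketch of the resulting loop shape; scan_one() and the counters are illustrative, not from the patch:

static long scan_all(long nr_to_scan)
{
        long scanned = 0;

        while (scanned < nr_to_scan) {
                /* Yield voluntarily instead of bailing out of the scan. */
                cond_resched();
                scanned += scan_one();  /* hypothetical per-iteration step */
        }
        return scanned;
}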
@@ -1285,14 +1280,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
ctx.last_ino);
}
- /*
- * We may be called from memory allocation paths, so we don't want to
- * take too much time and slowdown tasks, so stop if we need reschedule.
- */
- while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
+ while (ctx.scanned < ctx.nr_to_scan) {
struct btrfs_root *root;
unsigned long count;
+ cond_resched();
+
spin_lock(&fs_info->fs_roots_radix_lock);
count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)&root,
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index 8f95f3e44e99..df7f09f3b02e 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -637,7 +637,7 @@ static int extent_fiemap(struct btrfs_inode *inode,
struct btrfs_path *path;
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
- u64 last_extent_end;
+ u64 last_extent_end = 0;
u64 prev_extent_end;
u64 range_start;
u64 range_end;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5c342fe1af61..886749b39672 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -151,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
* Calculate the total size needed to allocate for an ordered sum structure
* spanning @bytes in the file.
*/
-static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes)
{
return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes);
}
@@ -1272,7 +1272,7 @@ out:
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
+ const struct btrfs_file_extent_item *fi,
struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 557dc43d7142..0e13661a71f3 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -74,7 +74,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
unsigned long *csum_bitmap);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
- struct btrfs_file_extent_item *fi,
+ const struct btrfs_file_extent_item *fi,
struct extent_map *em);
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9f10a9f23fcc..c5e36f58eb07 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1603,7 +1603,6 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_root *root = inode->root;
@@ -1613,7 +1612,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
- const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
+ bool skip_ilock = false;
+
+ if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) {
+ skip_ilock = true;
+ current->journal_info = NULL;
+ btrfs_assert_inode_locked(inode);
+ }
trace_btrfs_sync_file(file, datasync);
@@ -1868,7 +1873,10 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&inode->i_mmap_lock);
+ else
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -1912,8 +1920,8 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
+ page_start = folio_pos(folio);
+ page_end = page_start + folio_size(folio) - 1;
end = page_end;
/*
@@ -1941,18 +1949,18 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
again:
down_read(&BTRFS_I(inode)->i_mmap_lock);
- lock_page(page);
+ folio_lock(folio);
size = i_size_read(inode);
- if ((page->mapping != inode->i_mapping) ||
+ if ((folio->mapping != inode->i_mapping) ||
(page_start >= size)) {
/* Page got truncated out from underneath us. */
goto out_unlock;
}
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_page_extent_mapped(page);
+ ret2 = set_folio_extent_mapped(folio);
if (ret2 < 0) {
ret = vmf_error(ret2);
unlock_extent(io_tree, page_start, page_end, &cached_state);
@@ -1966,14 +1974,14 @@ again:
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
if (ordered) {
unlock_extent(io_tree, page_start, page_end, &cached_state);
- unlock_page(page);
+ folio_unlock(folio);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
- if (page->index == ((size - 1) >> PAGE_SHIFT)) {
+ if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
@@ -2003,13 +2011,13 @@ again:
}
/* Page is wholly or partially inside EOF. */
- if (page_start + PAGE_SIZE > size)
- zero_start = offset_in_page(size);
+ if (page_start + folio_size(folio) > size)
+ zero_start = offset_in_folio(folio, size);
else
zero_start = PAGE_SIZE;
if (zero_start != PAGE_SIZE)
- memzero_page(page, zero_start, PAGE_SIZE - zero_start);
+ folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
@@ -2026,7 +2034,7 @@ again:
return VM_FAULT_LOCKED;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f5996a43db24..eaa1dbd31352 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2697,15 +2697,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
int bg_reclaim_threshold = 0;
- bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
+ bool initial;
u64 reclaimable_unusable;
- WARN_ON(!initial && offset + size > block_group->zone_capacity);
+ spin_lock(&block_group->lock);
+ initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
if (!initial)
bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
- spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
else if (initial)
@@ -2718,7 +2719,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
to_free = offset + size - block_group->alloc_offset;
to_unusable = size - to_free;
+ spin_lock(&ctl->tree_lock);
ctl->free_space += to_free;
+ spin_unlock(&ctl->tree_lock);
/*
* If the block group is read-only, we should account freed space into
* bytes_readonly.
@@ -2727,11 +2730,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
block_group->zone_unusable += to_unusable;
WARN_ON(block_group->zone_unusable > block_group->length);
}
- spin_unlock(&ctl->tree_lock);
if (!used) {
- spin_lock(&block_group->lock);
block_group->alloc_offset -= size;
- spin_unlock(&block_group->lock);
}
reclaimable_unusable = block_group->zone_unusable -
@@ -2745,6 +2745,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
btrfs_mark_bg_to_reclaim(block_group);
}
+ spin_unlock(&block_group->lock);
+
return 0;
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 3d6d4b503220..79f64e383edd 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -703,8 +703,8 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
+ u32 sectors_per_page;
struct workqueue_struct *scrub_workers;
- struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 316756ff08ac..29572dfaf878 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -14,7 +14,7 @@
#include "extent-tree.h"
#include "file-item.h"
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
const struct fscrypt_str *name)
{
@@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
}
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct fscrypt_str *name)
{
struct btrfs_inode_extref *extref;
@@ -423,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
return ret;
}
-static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
- struct extent_buffer *leaf,
- struct btrfs_file_extent_item *fi,
+static inline void btrfs_trace_truncate(const struct btrfs_inode *inode,
+ const struct extent_buffer *leaf,
+ const struct btrfs_file_extent_item *fi,
u64 offset, int extent_type, int slot)
{
if (!inode)
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index c4aded82709b..c11b97fdccc4 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -109,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref(
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
-struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
+struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
const struct fscrypt_str *name);
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
- struct extent_buffer *leaf, int slot, u64 ref_objectid,
+ const struct extent_buffer *leaf, int slot, u64 ref_objectid,
const struct fscrypt_str *name);
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 19d05a4c5c33..edac499fd83d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -116,7 +116,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty);
@@ -393,17 +393,17 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
* extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
- struct page *locked_page,
+ struct folio *locked_folio,
u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
u64 page_start = 0, page_end = 0;
- struct page *page;
+ struct folio *folio;
- if (locked_page) {
- page_start = page_offset(locked_page);
- page_end = page_start + PAGE_SIZE - 1;
+ if (locked_folio) {
+ page_start = folio_pos(locked_folio);
+ page_end = page_start + folio_size(locked_folio) - 1;
}
while (index <= end_index) {
@@ -417,13 +417,13 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
- if (locked_page && index == (page_start >> PAGE_SHIFT)) {
+ if (locked_folio && index == (page_start >> PAGE_SHIFT)) {
index++;
continue;
}
- page = find_get_page(inode->vfs_inode.i_mapping, index);
+ folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
index++;
- if (!page)
+ if (IS_ERR(folio))
continue;
/*
@@ -431,14 +431,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
- page_folio(page), offset, bytes);
- put_page(page);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
+ offset, bytes);
+ folio_put(folio);
}
- if (locked_page) {
+ if (locked_folio) {
/* The locked page covers the full range, nothing needs to be done */
- if (bytes + offset <= page_start + PAGE_SIZE)
+ if (bytes + offset <= page_start + folio_size(locked_folio))
return;
/*
* In case this page belongs to the delalloc range being
@@ -447,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
- offset = page_offset(locked_page) + PAGE_SIZE;
+ bytes = offset + bytes - folio_pos(locked_folio) -
+ folio_size(locked_folio);
+ offset = folio_pos(locked_folio) + folio_size(locked_folio);
}
}
@@ -494,7 +495,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
- struct page *page = NULL;
const u32 sectorsize = trans->fs_info->sectorsize;
char *kaddr;
unsigned long ptr;
@@ -554,12 +554,16 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->vfs_inode.i_mapping, 0);
+ struct folio *folio;
+
+ folio = __filemap_get_folio(inode->vfs_inode.i_mapping,
+ 0, 0, 0);
+ ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0);
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_local(kaddr);
- put_page(page);
+ folio_put(folio);
}
btrfs_mark_buffer_dirty(trans, leaf);
btrfs_release_path(path);
@@ -715,7 +719,7 @@ out:
}
static noinline int cow_file_range_inline(struct btrfs_inode *inode,
- struct page *locked_page,
+ struct folio *locked_folio,
u64 offset, u64 end,
size_t compressed_size,
int compress_type,
@@ -740,13 +744,26 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
return ret;
}
+ /*
+ * In the successful case (ret == 0 here), cow_file_range will return 1.
+ *
+ * Quite a bit further up the callstack in extent_writepage(), ret == 1
+ * is treated as a short-circuited success and does not unlock the folio,
+ * so we must do it here.
+ *
+ * In the failure case, the locked_folio does get unlocked by
+ * btrfs_folio_end_all_writers, which asserts that it is still locked
+ * at that point, so we must *not* unlock it here.
+ *
+ * The other two callsites in compress_file_range do not have a
+ * locked_folio, so they are not relevant to this logic.
+ */
if (ret == 0)
- locked_page = NULL;
+ locked_folio = NULL;
- extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached,
- clear_flags,
- PAGE_UNLOCK | PAGE_START_WRITEBACK |
- PAGE_END_WRITEBACK);
+ extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
+ clear_flags, PAGE_UNLOCK |
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
return ret;
}
@@ -762,7 +779,7 @@ struct async_extent {
struct async_chunk {
struct btrfs_inode *inode;
- struct page *locked_page;
+ struct folio *locked_folio;
u64 start;
u64 end;
blk_opf_t write_flags;
@@ -868,25 +885,25 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode, small_write);
+ btrfs_add_inode_defrag(inode, small_write);
}
static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
int ret = 0;
for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) {
- page = find_get_page(inode->i_mapping, index);
- if (unlikely(!page)) {
+ folio = __filemap_get_folio(inode->i_mapping, index, 0, 0);
+ if (IS_ERR(folio)) {
if (!ret)
- ret = -ENOENT;
+ ret = PTR_ERR(folio);
continue;
}
- clear_page_dirty_for_io(page);
- put_page(page);
+ folio_clear_dirty_for_io(folio);
+ folio_put(folio);
}
return ret;
}
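Several conversions in this file (here and in btrfs_cleanup_ordered_extents(), insert_inline_extent() and btrfs_set_range_writeback()) swap find_get_page() for __filemap_get_folio(mapping, index, 0, 0), which returns an ERR_PTR rather than NULL when nothing is cached. A small sketch of that lookup pattern; the wrapper name is illustrative:

static struct folio *lookup_cached_folio(struct address_space *mapping,
                                         pgoff_t index)
{
        /* fgp_flags == 0, gfp == 0: look up only, never allocate. */
        struct folio *folio = __filemap_get_folio(mapping, index, 0, 0);

        if (IS_ERR(folio))
                return NULL;    /* not present in the page cache */

        return folio;           /* reference held; caller must folio_put() */
}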
@@ -1122,7 +1139,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
static void submit_uncompressed_range(struct btrfs_inode *inode,
struct async_extent *async_extent,
- struct page *locked_page)
+ struct folio *locked_folio)
{
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1135,20 +1152,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
};
wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
- ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
+ ret = run_delalloc_cow(inode, locked_folio, start, end,
+ &wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
- btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
- if (locked_page) {
- const u64 page_start = page_offset(locked_page);
-
- set_page_writeback(locked_page);
- end_page_writeback(locked_page);
- btrfs_mark_ordered_io_finished(inode, locked_page,
+ btrfs_cleanup_ordered_extents(inode, locked_folio,
+ start, end - start + 1);
+ if (locked_folio) {
+ const u64 page_start = folio_pos(locked_folio);
+
+ folio_start_writeback(locked_folio);
+ folio_end_writeback(locked_folio);
+ btrfs_mark_ordered_io_finished(inode, locked_folio,
page_start, PAGE_SIZE,
!ret);
- mapping_set_error(locked_page->mapping, ret);
- unlock_page(locked_page);
+ mapping_set_error(locked_folio->mapping, ret);
+ folio_unlock(locked_folio);
}
}
}
@@ -1164,7 +1183,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_ordered_extent *ordered;
struct btrfs_file_extent file_extent;
struct btrfs_key ins;
- struct page *locked_page = NULL;
+ struct folio *locked_folio = NULL;
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
@@ -1175,19 +1194,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
kthread_associate_blkcg(async_chunk->blkcg_css);
/*
- * If async_chunk->locked_page is in the async_extent range, we need to
+ * If async_chunk->locked_folio is in the async_extent range, we need to
* handle it.
*/
- if (async_chunk->locked_page) {
- u64 locked_page_start = page_offset(async_chunk->locked_page);
- u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
+ if (async_chunk->locked_folio) {
+ u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
+ u64 locked_folio_end = locked_folio_start +
+ folio_size(async_chunk->locked_folio) - 1;
- if (!(start >= locked_page_end || end <= locked_page_start))
- locked_page = async_chunk->locked_page;
+ if (!(start >= locked_folio_end || end <= locked_folio_start))
+ locked_folio = async_chunk->locked_folio;
}
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
- submit_uncompressed_range(inode, async_extent, locked_page);
+ submit_uncompressed_range(inode, async_extent, locked_folio);
goto done;
}
@@ -1202,7 +1222,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* non-contiguous space for the uncompressed size instead. So
* fall back to uncompressed.
*/
- submit_uncompressed_range(inode, async_extent, locked_page);
+ submit_uncompressed_range(inode, async_extent, locked_folio);
goto done;
}
@@ -1306,21 +1326,21 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* allocate extents on disk for the range, and create ordered data structs
* in ram to track those extents.
*
- * locked_page is the page that writepage had locked already. We use
+ * locked_folio is the folio that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
- * When this function fails, it unlocks all pages except @locked_page.
+ * When this function fails, it unlocks all pages except @locked_folio.
*
* When this function successfully creates an inline extent, it returns 1 and
- * unlocks all pages including locked_page and starts I/O on them.
- * (In reality inline extents are limited to a single page, so locked_page is
+ * unlocks all pages including locked_folio and starts I/O on them.
+ * (In reality inline extents are limited to a single page, so locked_folio is
* the only page handled anyway).
*
* When this function succeed and creates a normal extent, the page locking
* status depends on the passed in flags:
*
* - If @keep_locked is set, all pages are kept locked.
- * - Else all pages except for @locked_page are unlocked.
+ * - Else all pages except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
* while-loop, the ordered extents created in previous iterations are kept
@@ -1329,8 +1349,8 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* example.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
- struct page *locked_page, u64 start, u64 end,
- u64 *done_offset,
+ struct folio *locked_folio, u64 start,
+ u64 end, u64 *done_offset,
bool keep_locked, bool no_inline)
{
struct btrfs_root *root = inode->root;
@@ -1363,7 +1383,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (!no_inline) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, locked_page, start, end, 0,
+ ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
BTRFS_COMPRESS_NONE, NULL, false);
if (ret <= 0) {
/*
@@ -1500,7 +1520,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
- locked_page, &cached,
+ locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
@@ -1553,13 +1573,13 @@ out_unlock:
* function.
*
* However, in case of @keep_locked, we still need to unlock the pages
- * (except @locked_page) to ensure all the pages are unlocked.
+ * (except @locked_folio) to ensure all the pages are unlocked.
*/
if (keep_locked && orig_start < start) {
- if (!locked_page)
+ if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
- locked_page, NULL, 0, page_ops);
+ locked_folio, NULL, 0, page_ops);
}
/*
@@ -1582,9 +1602,9 @@ out_unlock:
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
- locked_page, &cached,
- clear_bits,
+ locked_folio, &cached, clear_bits,
page_ops);
+ btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
start += cur_alloc_size;
}
@@ -1596,8 +1616,9 @@ out_unlock:
*/
if (start < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
- extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ extent_clear_unlock_delalloc(inode, start, end, locked_folio,
&cached, clear_bits, page_ops);
+ btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
}
return ret;
}
@@ -1649,7 +1670,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
}
static bool run_delalloc_compressed(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1689,15 +1710,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
INIT_LIST_HEAD(&async_chunk[i].extents);
/*
- * The locked_page comes all the way from writepage and its
- * the original page we were actually given. As we spread
+ * The locked_folio comes all the way from writepage and it's
+ * the original folio we were actually given. As we spread
* this large delalloc region across multiple async_chunk
- * structs, only the first struct needs a pointer to locked_page
+ * structs, only the first struct needs a pointer to
+ * locked_folio.
*
* This way we don't need racey decisions about who is supposed
* to unlock it.
*/
- if (locked_page) {
+ if (locked_folio) {
/*
* Depending on the compressibility, the pages might or
* might not go through async. We want all of them to
@@ -1707,12 +1729,12 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
- wbc_account_cgroup_owner(wbc, locked_page,
+ wbc_account_cgroup_owner(wbc, &locked_folio->page,
cur_end - start);
- async_chunk[i].locked_page = locked_page;
- locked_page = NULL;
+ async_chunk[i].locked_folio = locked_folio;
+ locked_folio = NULL;
} else {
- async_chunk[i].locked_page = NULL;
+ async_chunk[i].locked_folio = NULL;
}
if (blkcg_css != blkcg_root_css) {
@@ -1741,7 +1763,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* covered by the range.
*/
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
- struct page *locked_page, u64 start,
+ struct folio *locked_folio, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
@@ -1749,20 +1771,21 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
int ret;
while (start <= end) {
- ret = cow_file_range(inode, locked_page, start, end, &done_offset,
- true, false);
+ ret = cow_file_range(inode, locked_folio, start, end,
+ &done_offset, true, false);
if (ret)
return ret;
- extent_write_locked_range(&inode->vfs_inode, locked_page, start,
- done_offset, wbc, pages_dirty);
+ extent_write_locked_range(&inode->vfs_inode, locked_folio,
+ start, done_offset, wbc, pages_dirty);
start = done_offset + 1;
}
return 1;
}
-static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
- const u64 start, const u64 end)
+static int fallback_to_cow(struct btrfs_inode *inode,
+ struct folio *locked_folio, const u64 start,
+ const u64 end)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
@@ -1831,7 +1854,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
* is written out and unlocked directly and a normal NOCOW extent
* doesn't work.
*/
- ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
+ ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
+ true);
ASSERT(ret != 1);
return ret;
}
@@ -1985,7 +2009,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* blocks on disk
*/
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
- struct page *locked_page,
+ struct folio *locked_folio,
const u64 start, const u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -2148,8 +2172,8 @@ must_cow:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = fallback_to_cow(inode, locked_page,
- cow_start, found_key.offset - 1);
+ ret = fallback_to_cow(inode, locked_folio, cow_start,
+ found_key.offset - 1);
cow_start = (u64)-1;
if (ret) {
btrfs_dec_nocow_writers(nocow_bg);
@@ -2204,7 +2228,7 @@ must_cow:
btrfs_put_ordered_extent(ordered);
extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_page, &cached_state,
+ locked_folio, &cached_state,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_ORDERED);
@@ -2226,7 +2250,7 @@ must_cow:
if (cow_start != (u64)-1) {
cur_offset = end;
- ret = fallback_to_cow(inode, locked_page, cow_start, end);
+ ret = fallback_to_cow(inode, locked_folio, cow_start, end);
cow_start = (u64)-1;
if (ret)
goto error;
@@ -2253,12 +2277,13 @@ error:
lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
- locked_page, &cached,
+ locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
+ btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
}
btrfs_free_path(path);
return ret;
@@ -2279,39 +2304,39 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
-int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
+int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
u64 start, u64 end, struct writeback_control *wbc)
{
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
int ret;
/*
- * The range must cover part of the @locked_page, or a return of 1
+ * The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
- ASSERT(!(end <= page_offset(locked_page) ||
- start >= page_offset(locked_page) + PAGE_SIZE));
+ ASSERT(!(end <= folio_pos(locked_folio) ||
+ start >= folio_pos(locked_folio) + folio_size(locked_folio)));
if (should_nocow(inode, start, end)) {
- ret = run_delalloc_nocow(inode, locked_page, start, end);
+ ret = run_delalloc_nocow(inode, locked_folio, start, end);
goto out;
}
if (btrfs_inode_can_compress(inode) &&
inode_need_compress(inode, start, end) &&
- run_delalloc_compressed(inode, locked_page, start, end, wbc))
+ run_delalloc_compressed(inode, locked_folio, start, end, wbc))
return 1;
if (zoned)
- ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
+ ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
true);
else
- ret = cow_file_range(inode, locked_page, start, end, NULL,
+ ret = cow_file_range(inode, locked_folio, start, end, NULL,
false, false);
out:
if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ btrfs_cleanup_ordered_extents(inode, locked_folio, start,
end - start + 1);
return ret;
}
@@ -2687,7 +2712,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
- struct page *page;
+ struct folio *folio;
struct btrfs_inode *inode;
struct btrfs_work work;
};
@@ -2699,50 +2724,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- struct page *page = fixup->page;
+ struct folio *folio = fixup->folio;
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 page_start = page_offset(page);
- u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
/*
* This is similar to page_mkwrite, we need to reserve the space before
- * we take the page lock.
+ * we take the folio lock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- PAGE_SIZE);
+ folio_size(folio));
again:
- lock_page(page);
+ folio_lock(folio);
/*
- * Before we queued this fixup, we took a reference on the page.
- * page->mapping may go NULL, but it shouldn't be moved to a different
+ * Before we queued this fixup, we took a reference on the folio.
+ * folio->mapping may go NULL, but it shouldn't be moved to a different
* address space.
*/
- if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+ if (!folio->mapping || !folio_test_dirty(folio) ||
+ !folio_test_checked(folio)) {
/*
* Unfortunately this is a little tricky, either
*
- * 1) We got here and our page had already been dealt with and
+ * 1) We got here and our folio had already been dealt with and
* we reserved our space, thus ret == 0, so we need to just
* drop our space reservation and bail. This can happen the
* first time we come into the fixup worker, or could happen
* while waiting for the ordered extent.
- * 2) Our page was already dealt with, but we happened to get an
+ * 2) Our folio was already dealt with, but we happened to get an
* ENOSPC above from the btrfs_delalloc_reserve_space. In
* this case we obviously don't have anything to release, but
- * because the page was already dealt with we don't want to
- * mark the page with an error, so make sure we're resetting
+ * because the folio was already dealt with we don't want to
+ * mark the folio with an error, so make sure we're resetting
* ret to 0. This is why we have this check _before_ the ret
* check, because we do not want to have a surprise ENOSPC
- * when the page was already properly dealt with.
+ * when the folio was already properly dealt with.
*/
if (!ret) {
- btrfs_delalloc_release_extents(inode, PAGE_SIZE);
+ btrfs_delalloc_release_extents(inode, folio_size(folio));
btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE,
+ page_start, folio_size(folio),
true);
}
ret = 0;
@@ -2750,7 +2776,7 @@ again:
}
/*
- * We can't mess with the page state unless it is locked, so now that
+ * We can't mess with the folio state unless it is locked, so now that
* it is locked bail if we failed to make our space reservation.
*/
if (ret)
@@ -2759,14 +2785,14 @@ again:
lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PageOrdered(page))
+ if (folio_test_ordered(folio))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
unlock_extent(&inode->io_tree, page_start, page_end,
&cached_state);
- unlock_page(page);
+ folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -2784,7 +2810,7 @@ again:
*
* The page was dirty when we started, nothing should have cleaned it.
*/
- BUG_ON(!PageDirty(page));
+ BUG_ON(!folio_test_dirty(folio));
free_delalloc_space = false;
out_reserved:
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
@@ -2798,14 +2824,14 @@ out_page:
* We hit ENOSPC or other errors. Update the mapping and page
* to reflect the errors and clean the page.
*/
- mapping_set_error(page->mapping, ret);
- btrfs_mark_ordered_io_finished(inode, page, page_start,
- PAGE_SIZE, !ret);
- clear_page_dirty_for_io(page);
- }
- btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
- unlock_page(page);
- put_page(page);
+ mapping_set_error(folio->mapping, ret);
+ btrfs_mark_ordered_io_finished(inode, folio, page_start,
+ folio_size(folio), !ret);
+ folio_clear_dirty_for_io(folio);
+ }
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ folio_unlock(folio);
+ folio_put(folio);
kfree(fixup);
extent_changeset_free(data_reserved);
/*
@@ -2818,33 +2844,34 @@ out_page:
/*
* There are a few paths in the higher layers of the kernel that directly
- * set the page dirty bit without asking the filesystem if it is a
+ * set the folio dirty bit without asking the filesystem if it is a
* good idea. This causes problems because we want to make sure COW
* properly happens and the data=ordered rules are followed.
*
* In our case any range that doesn't have the ORDERED bit set
* hasn't been properly setup for IO. We kick off an async process
* to fix it up. The async helper will wait for ordered extents, set
- * the delalloc bit and make it safe to write the page.
+ * the delalloc bit and make it safe to write the folio.
*/
-int btrfs_writepage_cow_fixup(struct page *page)
+int btrfs_writepage_cow_fixup(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_writepage_fixup *fixup;
- /* This page has ordered extent covering it already */
- if (PageOrdered(page))
+ /* This folio has ordered extent covering it already */
+ if (folio_test_ordered(folio))
return 0;
/*
- * PageChecked is set below when we create a fixup worker for this page,
- * don't try to create another one if we're already PageChecked()
+ * folio_checked is set below when we create a fixup worker for this
+ * folio, don't try to create another one if we're already
+ * folio_test_checked.
*
- * The extent_io writepage code will redirty the page if we send back
+ * The extent_io writepage code will redirty the folio if we send back
* EAGAIN.
*/
- if (PageChecked(page))
+ if (folio_test_checked(folio))
return -EAGAIN;
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
@@ -2854,14 +2881,14 @@ int btrfs_writepage_cow_fixup(struct page *page)
/*
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
- * takes place outside of the page lock, and we can't trust
- * page->mapping outside of the page lock.
+ * takes place outside of the folio lock, and we can't trust
+ * page->mapping outside of the folio lock.
*/
ihold(inode);
- btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
- get_page(page);
+ btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
+ folio_get(folio);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
- fixup->page = page;
+ fixup->folio = folio;
fixup->inode = BTRFS_I(inode);
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
@@ -4192,6 +4219,7 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
+ inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
ret = btrfs_update_inode(trans, dir);
@@ -6696,7 +6724,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct page *page,
+ struct folio *folio,
struct btrfs_file_extent_item *item)
{
int ret;
@@ -6718,7 +6746,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
- ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
+ ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
+ max_size);
/*
* decompression code contains a memset to fill in any space between the end
@@ -6729,36 +6758,36 @@ static noinline int uncompress_inline(struct btrfs_path *path,
*/
if (max_size < PAGE_SIZE)
- memzero_page(page, max_size, PAGE_SIZE - max_size);
+ folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
kfree(tmp);
return ret;
}
static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
- struct page *page)
+ struct folio *folio)
{
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
- if (!page || PageUptodate(page))
+ if (!folio || folio_test_uptodate(folio))
return 0;
- ASSERT(page_offset(page) == 0);
+ ASSERT(folio_pos(folio) == 0);
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
- return uncompress_inline(path, page, fi);
+ return uncompress_inline(path, folio, fi);
copy_size = min_t(u64, PAGE_SIZE,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
- kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
if (copy_size < PAGE_SIZE)
- memzero_page(page, copy_size, PAGE_SIZE - copy_size);
+ folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
return 0;
}
@@ -6780,7 +6809,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, u64 start, u64 len)
+ struct folio *folio, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
@@ -6803,7 +6832,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (em) {
if (em->start > start || em->start + em->len <= start)
free_extent_map(em);
- else if (em->disk_bytenr == EXTENT_MAP_INLINE && page)
+ else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
free_extent_map(em);
else
goto out;
@@ -6933,7 +6962,7 @@ next:
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
- ret = read_inline_extent(inode, path, page);
+ ret = read_inline_extent(inode, path, folio);
if (ret < 0)
goto out;
goto insert;
@@ -7175,13 +7204,12 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
*/
-static void wait_subpage_spinlock(struct page *page)
+static void wait_subpage_spinlock(struct folio *folio)
{
- struct btrfs_fs_info *fs_info = page_to_fs_info(page);
- struct folio *folio = page_folio(page);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page->mapping))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
@@ -7202,11 +7230,17 @@ static void wait_subpage_spinlock(struct page *page)
spin_unlock_irq(&subpage->lock);
}
+static int btrfs_launder_folio(struct folio *folio)
+{
+ return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
+ PAGE_SIZE, NULL);
+}
+
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
- if (try_release_extent_mapping(&folio->page, gfp_flags)) {
- wait_subpage_spinlock(&folio->page);
- clear_page_extent_mapped(&folio->page);
+ if (try_release_extent_mapping(folio, gfp_flags)) {
+ wait_subpage_spinlock(folio);
+ clear_folio_extent_mapped(folio);
return true;
}
return false;
@@ -7266,7 +7300,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* do double ordered extent accounting on the same folio.
*/
folio_wait_writeback(folio);
- wait_subpage_spinlock(&folio->page);
+ wait_subpage_spinlock(folio);
/*
* For subpage case, we have call sites like
@@ -7404,7 +7438,7 @@ next:
btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
- clear_page_extent_mapped(&folio->page);
+ clear_folio_extent_mapped(folio);
}
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
@@ -8941,19 +8975,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
u32 len;
ASSERT(end + 1 - start <= U32_MAX);
len = end + 1 - start;
while (index <= end_index) {
- page = find_get_page(inode->vfs_inode.i_mapping, index);
- ASSERT(page); /* Pages should be in the extent_io_tree */
+ folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
+ ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
/* This is for data, which doesn't yet support larger folio. */
- ASSERT(folio_order(page_folio(page)) == 0);
- btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
- put_page(page);
+ ASSERT(folio_order(folio) == 0);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ folio_put(folio);
index++;
}
}
@@ -9118,7 +9152,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
atomic_inc(&priv.pending);
- btrfs_submit_bio(bbio, 0);
+ btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
btrfs_encoded_read_endio, &priv);
@@ -9133,7 +9167,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
} while (disk_io_size);
atomic_inc(&priv.pending);
- btrfs_submit_bio(bbio, 0);
+ btrfs_submit_bbio(bbio, 0);
if (atomic_dec_return(&priv.pending))
io_wait_event(priv.wait, !atomic_read(&priv.pending));
@@ -10137,6 +10171,7 @@ static const struct address_space_operations btrfs_aops = {
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.invalidate_folio = btrfs_invalidate_folio,
+ .launder_folio = btrfs_launder_folio,
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e0a664b8a46a..8537eb9b5531 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -543,13 +543,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
- if (ret < 0)
- return ret;
if (copy_to_user(arg, &range, sizeof(range)))
return -EFAULT;
- return 0;
+ return ret;
}
int __pure btrfs_is_empty_uuid(const u8 *uuid)
@@ -4765,11 +4763,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
/*
- * The transaction thread may want to do more work,
- * namely it pokes the cleaner kthread that will start
- * processing uncleaned subvols.
+ * There may be work for the cleaner kthread to do (subvolume
+ * deletion, delayed iputs, defrag inodes, etc), so wake it up.
*/
- wake_up_process(fs_info->transaction_kthread);
+ wake_up_process(fs_info->cleaner_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1e2a68b8f62d..72856f6775f7 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -438,11 +438,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page);
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
@@ -467,22 +467,22 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (unlikely(ret != LZO_E_OK)) {
- struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
btrfs_err(fs_info,
"lzo decompression failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
- page_offset(dest_page));
+ folio_pos(dest_folio));
ret = -EIO;
goto out;
}
ASSERT(out_len <= sectorsize);
- memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+ memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len);
/* Early end, considered as an error. */
if (unlikely(out_len < destlen)) {
ret = -EIO;
- memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
+ folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len);
}
out:
return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 82a68394a89c..2104d60c2161 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -332,7 +332,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
}
static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
- struct page *page, u64 file_offset,
+ struct folio *folio, u64 file_offset,
u64 len, bool uptodate)
{
struct btrfs_inode *inode = ordered->inode;
@@ -340,10 +340,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
lockdep_assert_held(&inode->ordered_tree_lock);
- if (page) {
- ASSERT(page->mapping);
- ASSERT(page_offset(page) <= file_offset);
- ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);
+ if (folio) {
+ ASSERT(folio->mapping);
+ ASSERT(folio_pos(folio) <= file_offset);
+ ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
/*
* Ordered (Private2) bit indicates whether we still have
@@ -351,10 +351,9 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
*
* If there's no such bit, we need to skip to next range.
*/
- if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
- file_offset, len))
+ if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
return false;
- btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
+ btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
}
/* Now we're fine to update the accounting. */
@@ -398,7 +397,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
}
void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
- struct page *page, u64 file_offset, u64 len,
+ struct folio *folio, u64 file_offset, u64 len,
bool uptodate)
{
struct btrfs_inode *inode = ordered->inode;
@@ -408,7 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
spin_lock_irqsave(&inode->ordered_tree_lock, flags);
- ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
+ ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
+ uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
/*
@@ -449,8 +449,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
/*
* Mark all ordered extents io inside the specified range finished.
*
- * @page: The involved page for the operation.
- * For uncompressed buffered IO, the page status also needs to be
+ * @folio: The involved folio for the operation.
+ * For uncompressed buffered IO, the folio status also needs to be
* updated to indicate whether the pending ordered io is finished.
* Can be NULL for direct IO and compressed write.
* For these cases, callers are ensured they won't execute the
@@ -460,7 +460,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
* extent(s) covering it.
*/
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
- struct page *page, u64 file_offset,
+ struct folio *folio, u64 file_offset,
u64 num_bytes, bool uptodate)
{
struct rb_node *node;
@@ -524,7 +524,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
ASSERT(end + 1 - cur < U32_MAX);
len = end + 1 - cur;
- if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
+ if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
btrfs_queue_ordered_fn(entry);
spin_lock_irqsave(&inode->ordered_tree_lock, flags);
@@ -1015,7 +1015,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
{
struct rb_node *n;
- ASSERT(inode_is_locked(&inode->vfs_inode));
+ btrfs_assert_inode_locked(inode);
spin_lock_irq(&inode->ordered_tree_lock);
for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 51b9e81726e2..4e152736d06c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -163,11 +163,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
- struct page *page, u64 file_offset, u64 len,
+ struct folio *folio, u64 file_offset, u64 len,
bool uptodate);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
- struct page *page, u64 file_offset,
- u64 num_bytes, bool uptodate);
+ struct folio *folio, u64 file_offset,
+ u64 num_bytes, bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 6195a2215b8f..9f3ad124104f 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -9,9 +9,8 @@
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
- int ret = 0;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -21,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
-
- btrfs_free_path(path);
- return ret;
+ return btrfs_insert_empty_item(trans, root, path, &key, 0);
}
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -44,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
- if (ret) { /* JDM: Really? */
- ret = -ENOENT;
- goto out;
- }
-
- ret = btrfs_del_item(trans, root, path);
+ return ret;
+ if (ret)
+ return -ENOENT;
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, root, path);
}
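BTRFS_PATH_AUTO_FREE used above presumably builds on the kernel's scope-based cleanup helpers (linux/cleanup.h), which free the variable automatically when it goes out of scope and make the early returns safe without a manual free on every path. A generic sketch of that mechanism using the stock kfree helper; the function is illustrative, not btrfs code:

#include <linux/cleanup.h>
#include <linux/slab.h>         /* provides the kfree __free() helper */

static int build_scratch_buffer(size_t len)
{
        char *buf __free(kfree) = kmalloc(len, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;

        /* ... use buf; every return below frees it automatically ... */
        return 0;
}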
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 32dcea662da3..fc821aa446f0 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -14,7 +14,7 @@
struct root_name_map {
u64 id;
- char name[16];
+ const char *name;
};
static const struct root_name_map root_map[] = {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5d57a285d59b..c297909f1506 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1998,16 +1998,14 @@ out:
*
* Return 0 for success insert
* Return >0 for existing record, caller can free @record safely.
- * Error is not possible
+ * Return <0 for insertion failure, caller can free @record safely.
*/
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
{
- struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_qgroup_extent_record *entry;
- u64 bytenr = record->bytenr;
+ struct btrfs_qgroup_extent_record *existing, *ret;
+ unsigned long bytenr = record->bytenr;
if (!btrfs_qgroup_full_accounting(fs_info))
return 1;
@@ -2015,26 +2013,24 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
- node);
- if (bytenr < entry->bytenr) {
- p = &(*p)->rb_left;
- } else if (bytenr > entry->bytenr) {
- p = &(*p)->rb_right;
- } else {
- if (record->data_rsv && !entry->data_rsv) {
- entry->data_rsv = record->data_rsv;
- entry->data_rsv_refroot =
- record->data_rsv_refroot;
- }
- return 1;
+ xa_lock(&delayed_refs->dirty_extents);
+ existing = xa_load(&delayed_refs->dirty_extents, bytenr);
+ if (existing) {
+ if (record->data_rsv && !existing->data_rsv) {
+ existing->data_rsv = record->data_rsv;
+ existing->data_rsv_refroot = record->data_rsv_refroot;
}
+ xa_unlock(&delayed_refs->dirty_extents);
+ return 1;
+ }
+
+ ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC);
+ xa_unlock(&delayed_refs->dirty_extents);
+ if (xa_is_err(ret)) {
+ qgroup_mark_inconsistent(fs_info);
+ return xa_err(ret);
}
- rb_link_node(&record->node, parent_node, p);
- rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
return 0;
}
@@ -2141,6 +2137,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!record)
return -ENOMEM;
+ if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) {
+ kfree(record);
+ return -ENOMEM;
+ }
+
delayed_refs = &trans->transaction->delayed_refs;
record->bytenr = bytenr;
record->num_bytes = num_bytes;
@@ -2149,7 +2150,9 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
spin_lock(&delayed_refs->lock);
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
spin_unlock(&delayed_refs->lock);
- if (ret > 0) {
+ if (ret) {
+ /* Clean up if insertion fails or item exists. */
+ xa_release(&delayed_refs->dirty_extents, record->bytenr);
kfree(record);
return 0;
}
@@ -3018,7 +3021,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
- struct rb_node *node;
+ unsigned long index;
u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
@@ -3028,10 +3031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
- while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
- record = rb_entry(node, struct btrfs_qgroup_extent_record,
- node);
-
+ xa_for_each(&delayed_refs->dirty_extents, index, record) {
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
@@ -3097,7 +3097,7 @@ cleanup:
ulist_free(record->old_roots);
ulist_free(new_roots);
new_roots = NULL;
- rb_erase(node, &delayed_refs->dirty_extent_root);
+ xa_erase(&delayed_refs->dirty_extents, index);
kfree(record);
}
@@ -4185,6 +4185,8 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0;
}
+ btrfs_run_delayed_iputs(root->fs_info);
+ btrfs_wait_on_delayed_iputs(root->fs_info);
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
@@ -4344,10 +4346,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
- extent_changeset_init(&changeset);
return clear_record_extent_bits(&inode->io_tree, start,
start + len - 1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ EXTENT_QGROUP_RESERVED, NULL);
}
/* In release case, we shouldn't have @reserved */
@@ -4873,15 +4874,13 @@ out:
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
struct btrfs_qgroup_extent_record *entry;
- struct btrfs_qgroup_extent_record *next;
- struct rb_root *root;
+ unsigned long index;
- root = &trans->delayed_refs.dirty_extent_root;
- rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
+ xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) {
ulist_free(entry->old_roots);
kfree(entry);
}
- *root = RB_ROOT;
+ xa_destroy(&trans->delayed_refs.dirty_extents);
}
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
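The rb-tree to xarray conversion above reduces to an insert-if-absent under the xarray's internal spinlock, with the slot pre-reserved in a sleepable context so the later atomic store cannot fail on allocation. A compressed sketch of that pattern with demo_* placeholders standing in for the btrfs types (kernel-module context assumed, not a drop-in for the code above):

#include <linux/xarray.h>
#include <linux/gfp.h>
#include <linux/errno.h>

struct demo_record {
	unsigned long bytenr;	/* placeholder for the qgroup extent record */
};

static DEFINE_XARRAY(demo_dirty_extents);

/* Returns 0 on insert, 1 if an entry already existed, negative errno on failure. */
static int demo_trace_extent(struct demo_record *rec)
{
	struct demo_record *existing;
	void *old;

	/* Reserve the slot while sleeping is still allowed. */
	if (xa_reserve(&demo_dirty_extents, rec->bytenr, GFP_NOFS))
		return -ENOMEM;

	xa_lock(&demo_dirty_extents);
	existing = xa_load(&demo_dirty_extents, rec->bytenr);
	if (existing) {
		xa_unlock(&demo_dirty_extents);
		xa_release(&demo_dirty_extents, rec->bytenr);
		return 1;
	}
	old = __xa_store(&demo_dirty_extents, rec->bytenr, rec, GFP_ATOMIC);
	xa_unlock(&demo_dirty_extents);

	return xa_is_err(old) ? xa_err(old) : 0;
}

In the patch the reservation happens in btrfs_qgroup_trace_extent() with GFP_NOFS and is dropped again via xa_release() whenever the record is not stored, mirroring the existing/failed paths of the sketch.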
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index deb479d176a9..98adf4ec7b01 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -125,7 +125,6 @@ struct btrfs_inode;
* Record a dirty extent, and info qgroup to update quota on it
*/
struct btrfs_qgroup_extent_record {
- struct rb_node node;
u64 bytenr;
u64 num_bytes;
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index e6f7a234b8f6..4c859b550f6c 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -66,6 +66,11 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
if (ret)
break;
+ start += key.offset;
+ length -= key.offset;
+ if (length == 0)
+ break;
+
btrfs_release_path(path);
}
@@ -73,6 +78,36 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
return ret;
}
+static int update_raid_extent_item(struct btrfs_trans_handle *trans,
+ struct btrfs_key *key,
+ struct btrfs_stripe_extent *stripe_extent,
+ const size_t item_size)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
+ 0, 1);
+ if (ret)
+ return (ret == 1 ? ret : -EINVAL);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
+ item_size);
+ btrfs_mark_buffer_dirty(trans, leaf);
+ btrfs_free_path(path);
+
+ return ret;
+}
+
static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_io_context *bioc)
{
@@ -112,6 +147,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
item_size);
+ if (ret == -EEXIST)
+ ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
+ item_size);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -172,7 +210,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
if (!path)
return -ENOMEM;
- if (stripe->is_scrub) {
+ if (stripe->rst_search_commit_root) {
path->skip_locking = 1;
path->search_commit_root = 1;
}
@@ -245,10 +283,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
out:
if (ret > 0)
ret = -ENOENT;
- if (ret && ret != -EIO && !stripe->is_scrub) {
- if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
- btrfs_print_tree(leaf, 1);
- btrfs_err(fs_info,
+ if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
+ btrfs_debug(fs_info,
"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
logical, logical + *length, stripe->dev->devid,
btrfs_bg_type_to_raid_name(map_type));
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index df6b93b927cd..f0824c948cb7 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -66,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
@@ -83,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret)
goto out;
- page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(mapping));
- if (!page) {
+ folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ btrfs_alloc_write_mask(mapping));
+ if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out_unlock;
}
- ret = set_page_extent_mapped(page);
+ ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto out_unlock;
@@ -115,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
if (comp_type == BTRFS_COMPRESS_NONE) {
- memcpy_to_page(page, offset_in_page(file_offset), data_start,
- datal);
+ memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
+ datal);
} else {
- ret = btrfs_decompress(comp_type, data_start, page,
- offset_in_page(file_offset),
+ ret = btrfs_decompress(comp_type, data_start, folio,
+ offset_in_folio(folio, file_offset),
inline_size, datal);
if (ret)
goto out_unlock;
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
}
/*
@@ -139,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
* So what's in the range [500, 4095] corresponds to zeroes.
*/
if (datal < block_size)
- memzero_page(page, datal, block_size - datal);
+ folio_zero_range(folio, datal, block_size - datal);
- btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size);
- btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size);
- btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
+ btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
+ btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
out_unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (!IS_ERR(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
}
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
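The page-to-folio conversion above replaces find_or_create_page() with __filemap_get_folio(), which signals failure through ERR_PTR() rather than NULL and pairs with folio_unlock()/folio_put() for teardown. A condensed sketch of that lookup/copy/teardown pattern, with placeholder arguments and a generic gfp mask instead of btrfs_alloc_write_mask() (kernel context assumed):

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/err.h>

/* Copy @len bytes to file position @pos of @mapping; a sketch, not the btrfs helper. */
static int demo_copy_to_folio(struct address_space *mapping, loff_t pos,
			      const char *data, size_t len)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	memcpy_to_folio(folio, offset_in_folio(folio, pos), data, len);
	flush_dcache_folio(folio);
	folio_mark_dirty(folio);

	folio_unlock(folio);
	folio_put(folio);
	return 0;
}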
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0533d0f82dc9..ea4ed85919ec 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -36,6 +36,7 @@
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
+#include "raid-stripe-tree.h"
/*
* Relocation overview
@@ -2965,21 +2966,34 @@ static int relocate_one_folio(struct reloc_control *rc,
u64 folio_end;
u64 cur;
int ret;
+ const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
ASSERT(index <= last_index);
folio = filemap_lock_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
- page_cache_sync_readahead(inode->i_mapping, ra, NULL,
- index, last_index + 1 - index);
+
+ /*
+ * On relocation we're doing readahead on the relocation inode,
+ * but if the filesystem is backed by a RAID stripe tree we can
+ * get ENOENT (e.g. due to preallocated extents not being
+ * mapped in the RST) from the lookup.
+ *
+ * But readahead doesn't handle the error and submits invalid
+ * reads to the device, causing assertion failures.
+ */
+ if (!use_rst)
+ page_cache_sync_readahead(inode->i_mapping, ra, NULL,
+ index, last_index + 1 - index);
folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mask);
if (IS_ERR(folio))
return PTR_ERR(folio);
}
WARN_ON(folio_order(folio));
- if (folio_test_readahead(folio))
+ if (folio_test_readahead(folio) && !use_rst)
page_cache_async_readahead(inode->i_mapping, ra, NULL,
folio, last_index + 1 - index);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 14a8d7100018..3a3427428074 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -838,7 +838,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
bbio->bio.bi_iter.bi_size >= blocksize)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
if (wait)
wait_scrub_stripe_io(stripe);
bbio = NULL;
@@ -857,7 +857,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
if (bbio) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
if (wait)
wait_scrub_stripe_io(stripe);
}
@@ -1648,14 +1648,20 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
}
}
+static u32 stripe_length(const struct scrub_stripe *stripe)
+{
+ ASSERT(stripe->bg);
+
+ return min(BTRFS_STRIPE_LEN,
+ stripe->bg->start + stripe->bg->length - stripe->logical);
+}
+
static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
struct scrub_stripe *stripe)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
- unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
- stripe->bg->length - stripe->logical) >>
- fs_info->sectorsize_bits;
+ unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
@@ -1677,7 +1683,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
bbio->bio.bi_iter.bi_size >= stripe_len)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
bbio = NULL;
}
@@ -1688,7 +1694,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
(i << fs_info->sectorsize_bits);
int err;
- io_stripe.is_scrub = true;
+ io_stripe.rst_search_commit_root = true;
stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
/*
* For RST cases, we need to manually split the bbio to
@@ -1714,7 +1720,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
if (bbio) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
}
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1729,9 +1735,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
- unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
- stripe->bg->length - stripe->logical) >>
- fs_info->sectorsize_bits;
+ unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
@@ -1772,7 +1776,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
mirror = calc_next_mirror(mirror, num_copies);
}
- btrfs_submit_bio(bbio, mirror);
+ btrfs_submit_bbio(bbio, mirror);
}
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
@@ -1871,6 +1875,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
stripe = &sctx->stripes[i];
wait_scrub_stripe_io(stripe);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.last_physical = stripe->physical + stripe_length(stripe);
+ spin_unlock(&sctx->stat_lock);
scrub_reset_stripe(stripe);
}
out:
@@ -2139,7 +2146,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
cur_physical, &found_logical);
if (ret > 0) {
/* No more extent, just update the accounting */
+ spin_lock(&sctx->stat_lock);
sctx->stat.last_physical = physical + logical_length;
+ spin_unlock(&sctx->stat_lock);
ret = 0;
break;
}
@@ -2336,6 +2345,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
stripe_logical += chunk_logical;
ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
map, stripe_logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN,
+ physical_end);
+ spin_unlock(&sctx->stat_lock);
if (ret)
goto out;
goto next;
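The new stripe_length() helper factored out above clamps the fixed BTRFS_STRIPE_LEN to whatever remains of the block group past stripe->logical, so the last stripe of a group is never over-read; the result, shifted by sectorsize_bits, gives the nr_sectors used by both read paths. A worked userspace example of the clamp (sizes are arbitrary):

#include <stdio.h>

#define DEMO_STRIPE_LEN (64 * 1024ULL)

static unsigned long long demo_stripe_length(unsigned long long bg_start,
					     unsigned long long bg_length,
					     unsigned long long logical)
{
	unsigned long long remaining = bg_start + bg_length - logical;

	return remaining < DEMO_STRIPE_LEN ? remaining : DEMO_STRIPE_LEN;
}

int main(void)
{
	unsigned long long bg_start = 1ULL << 20;	/* block group at 1 MiB */
	unsigned long long bg_length = 100 * 1024;	/* 100 KiB long */

	/* First stripe is full sized; the second is clamped to the 36 KiB tail. */
	printf("%llu\n", demo_stripe_length(bg_start, bg_length, bg_start));
	printf("%llu\n", demo_stripe_length(bg_start, bg_length,
					    bg_start + DEMO_STRIPE_LEN));
	return 0;
}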
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4ca711a773ef..7f48ba6c1c77 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -62,7 +62,7 @@ struct fs_path {
/*
* Average path length does not exceed 200 bytes, we'll have
* better packing in the slab and higher chance to satisfy
- * a allocation later during send.
+ * an allocation later during send.
*/
char pad[256];
};
@@ -347,7 +347,7 @@ struct name_cache_entry {
int ret;
int need_later_update;
int name_len;
- char name[];
+ char name[] __counted_by(name_len);
};
/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
@@ -1136,7 +1136,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
/*
* Start with a small buffer (1 page). If later we end up needing more
* space, which can happen for xattrs on a fs with a leaf size greater
- * then the page size, attempt to increase the buffer. Typically xattr
+ * than the page size, attempt to increase the buffer. Typically xattr
* values are small.
*/
buf_len = PATH_MAX;
@@ -6157,25 +6157,51 @@ static int send_write_or_clone(struct send_ctx *sctx,
u64 offset = key->offset;
u64 end;
u64 bs = sctx->send_root->fs_info->sectorsize;
+ struct btrfs_file_extent_item *ei;
+ u64 disk_byte;
+ u64 data_offset;
+ u64 num_bytes;
+ struct btrfs_inode_info info = { 0 };
end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
if (offset >= end)
return 0;
- if (clone_root && IS_ALIGNED(end, bs)) {
- struct btrfs_file_extent_item *ei;
- u64 disk_byte;
- u64 data_offset;
+ num_bytes = end - offset;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
- data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
- ret = clone_range(sctx, path, clone_root, disk_byte,
- data_offset, offset, end - offset);
- } else {
- ret = send_extent_data(sctx, path, offset, end - offset);
- }
+ if (!clone_root)
+ goto write_data;
+
+ if (IS_ALIGNED(end, bs))
+ goto clone_data;
+
+ /*
+ * If the extent end is not aligned, we can clone if the extent ends at
+ * the i_size of the inode and the clone range ends at the i_size of the
+ * source inode, otherwise the clone operation fails with -EINVAL.
+ */
+ if (end != sctx->cur_inode_size)
+ goto write_data;
+
+ ret = get_inode_info(clone_root->root, clone_root->ino, &info);
+ if (ret < 0)
+ return ret;
+
+ if (clone_root->offset + num_bytes == info.size)
+ goto clone_data;
+
+write_data:
+ ret = send_extent_data(sctx, path, offset, num_bytes);
+ sctx->cur_inode_next_write_offset = end;
+ return ret;
+
+clone_data:
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+ data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+ ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
+ num_bytes);
sctx->cur_inode_next_write_offset = end;
return ret;
}
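The restructured send_write_or_clone() above adds one rule on top of the old alignment check: an extent whose end is not block aligned may still be cloned, but only when both the destination range and the source range end exactly at their inodes' i_size, since the kernel otherwise rejects the clone with -EINVAL. That decision in isolation, as a sketch with hypothetical parameter names:

#include <stdbool.h>

static bool demo_can_clone(unsigned long long end, unsigned int block_size,
			   unsigned long long dst_inode_size,
			   unsigned long long src_clone_end,
			   unsigned long long src_inode_size)
{
	/* Block-aligned extent ends can always be cloned. */
	if ((end % block_size) == 0)
		return true;
	/* Unaligned end: both ranges must stop exactly at their inode sizes. */
	return end == dst_inode_size && src_clone_end == src_inode_size;
}

Either way the function above finishes by advancing cur_inode_next_write_offset to the extent end, which the write_data and clone_data paths now share.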
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 68e14fd48638..d5a9cd8a4fd8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -163,7 +163,7 @@
* thing with or without extra unallocated space.
*/
-u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
bool may_use_included)
{
ASSERT(s_info);
@@ -368,7 +368,7 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
}
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+ const struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
u64 profile;
@@ -437,7 +437,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
}
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
+ const struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
u64 avail;
@@ -542,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
-static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *info)
+static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *info)
{
const char *flag_str = space_info_flag_to_str(info);
lockdep_assert_held(&info->lock);
@@ -844,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info,
return;
}
-static inline u64
-btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *space_info)
{
u64 used;
u64 avail;
@@ -871,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
}
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
+ const struct btrfs_space_info *space_info)
{
const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
@@ -1943,7 +1942,7 @@ static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
* Typically with 10 block groups as the target, the discrete values this comes
* out to are 0, 10, 20, ... , 80, 90, and 99.
*/
-static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info)
+static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info)
{
struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
@@ -1962,7 +1961,7 @@ static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info)
return calc_pct_ratio(want, target);
}
-int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info)
+int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info)
{
lockdep_assert_held(&space_info->lock);
@@ -1985,8 +1984,8 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
return unalloc < data_chunk_size;
}
-static int do_reclaim_sweep(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, int raid)
+static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, int raid)
{
struct btrfs_block_group *bg;
int thresh_pct;
@@ -2031,7 +2030,6 @@ again:
}
up_read(&space_info->groups_sem);
- return 0;
}
void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
@@ -2074,21 +2072,15 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
return ret;
}
-int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info)
+void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
{
- int ret;
int raid;
struct btrfs_space_info *space_info;
list_for_each_entry(space_info, &fs_info->space_info, list) {
if (!btrfs_should_periodic_reclaim(space_info))
continue;
- for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
- ret = do_reclaim_sweep(fs_info, space_info, raid);
- if (ret)
- return ret;
- }
+ for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
+ do_reclaim_sweep(fs_info, space_info, raid);
}
-
- return ret;
}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 88b44221ce97..efbecc0c5258 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -217,7 +217,7 @@ struct reserve_ticket {
wait_queue_head_t wait;
};
-static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
+static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
@@ -258,7 +258,7 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
-u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -271,7 +271,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
+ const struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
@@ -293,7 +293,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
-int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info);
-int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info);
+int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
+void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 8ddd5fcbeb93..fe4d719d506b 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -64,6 +64,7 @@
* This means a slightly higher tree locking latency.
*/
+#if PAGE_SIZE > SZ_4K
bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
{
if (fs_info->sectorsize >= PAGE_SIZE)
@@ -85,37 +86,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space
return true;
return false;
}
-
-void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
-{
- unsigned int cur = 0;
- unsigned int nr_bits;
-
- ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
-
- nr_bits = PAGE_SIZE / sectorsize;
- subpage_info->bitmap_nr_bits = nr_bits;
-
- subpage_info->uptodate_offset = cur;
- cur += nr_bits;
-
- subpage_info->dirty_offset = cur;
- cur += nr_bits;
-
- subpage_info->writeback_offset = cur;
- cur += nr_bits;
-
- subpage_info->ordered_offset = cur;
- cur += nr_bits;
-
- subpage_info->checked_offset = cur;
- cur += nr_bits;
-
- subpage_info->locked_offset = cur;
- cur += nr_bits;
-
- subpage_info->total_nr_bits = cur;
-}
+#endif
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct folio *folio, enum btrfs_subpage_type type)
@@ -163,7 +134,7 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(fs_info->sectorsize < PAGE_SIZE);
real_size = struct_size(ret, bitmaps,
- BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page));
ret = kzalloc(real_size, GFP_NOFS);
if (!ret)
return ERR_PTR(-ENOMEM);
@@ -246,7 +217,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
\
btrfs_subpage_assert(fs_info, folio, start, len); \
__start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- __start_bit += fs_info->subpage_info->name##_offset; \
+ __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \
__start_bit; \
})
@@ -351,6 +322,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
unsigned long flags;
+ unsigned int cleared = 0;
+ int bit = start_bit;
bool last;
btrfs_subpage_assert(fs_info, folio, start, len);
@@ -368,11 +341,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
return true;
}
- ASSERT(atomic_read(&subpage->writers) >= nbits);
- /* The target range should have been locked. */
- ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
- bitmap_clear(subpage->bitmaps, start_bit, nbits);
- last = atomic_sub_and_test(nbits, &subpage->writers);
+ for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) {
+ clear_bit(bit, subpage->bitmaps);
+ cleared++;
+ }
+ ASSERT(atomic_read(&subpage->writers) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->writers);
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
}
@@ -404,27 +378,94 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * Handle different locked folios:
+ *
+ * - Non-subpage folio
+ * Just unlock it.
+ *
+ * - folio locked but without any subpage range locked
+ * This happens either before writepage_delalloc() or the delalloc range has
+ * already been handled by a previous folio.
+ * We can simply unlock it.
+ *
+ * - folio locked with subpage range locked.
+ * We go through the locked sectors inside the range, clear their bits in the
+ * locked bitmap, reduce the writer lock count, and unlock the folio if that
+ * was the last locked range.
+ */
void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+
+ ASSERT(folio_test_locked(folio));
+
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
folio_unlock(folio);
return;
}
+
+ /*
+ * For the subpage case, there are two types of locked folios: with or
+ * without a writers count.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers) == 0) {
+ /* No writers, locked by plain lock_page(). */
+ folio_unlock(folio);
+ return;
+ }
+
btrfs_subpage_clamp_range(folio, &start, &len);
if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
folio_unlock(folio);
}
+void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap)
+{
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
+ unsigned long flags;
+ bool last = false;
+ int cleared = 0;
+ int bit;
+
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+
+ if (atomic_read(&subpage->writers) == 0) {
+ /* No writers, locked by plain lock_page(). */
+ folio_unlock(folio);
+ return;
+ }
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) {
+ if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
+ cleared++;
+ }
+ ASSERT(atomic_read(&subpage->writers) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->writers);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ if (last)
+ folio_unlock(folio);
+}
+
#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
bitmap_test_range_all_set(subpage->bitmaps, \
- fs_info->subpage_info->name##_offset, \
- fs_info->subpage_info->bitmap_nr_bits)
+ fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
+ fs_info->sectors_per_page)
#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
bitmap_test_range_all_zero(subpage->bitmaps, \
- fs_info->subpage_info->name##_offset, \
- fs_info->subpage_info->bitmap_nr_bits)
+ fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
+ fs_info->sectors_per_page)
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
@@ -729,53 +770,6 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
}
/*
- * Handle different locked pages with different page sizes:
- *
- * - Page locked by plain lock_page()
- * It should not have any subpage::writers count.
- * Can be unlocked by unlock_page().
- * This is the most common locked page for __extent_writepage() called
- * inside extent_write_cache_pages().
- * Rarer cases include the @locked_page from extent_write_locked_range().
- *
- * - Page locked by lock_delalloc_pages()
- * There is only one caller, all pages except @locked_page for
- * extent_write_locked_range().
- * In this case, we have to call subpage helper to handle the case.
- */
-void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage;
-
- ASSERT(folio_test_locked(folio));
- /* For non-subpage case, we just unlock the page */
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
- folio_unlock(folio);
- return;
- }
-
- ASSERT(folio_test_private(folio) && folio_get_private(folio));
- subpage = folio_get_private(folio);
-
- /*
- * For subpage case, there are two types of locked page. With or
- * without writers number.
- *
- * Since we own the page lock, no one else could touch subpage::writers
- * and we are safe to do several atomic operations without spinlock.
- */
- if (atomic_read(&subpage->writers) == 0) {
- /* No writers, locked by plain lock_page() */
- folio_unlock(folio);
- return;
- }
-
- /* Have writers, use proper subpage helper to end it */
- btrfs_folio_end_writer_lock(fs_info, folio, start, len);
-}
-
-/*
* This is for folio already locked by plain lock_page()/folio_lock(), which
* doesn't have any subpage awareness.
*
@@ -803,7 +797,7 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->writers);
- ASSERT(ret <= fs_info->subpage_info->bitmap_nr_bits);
+ ASSERT(ret <= fs_info->sectors_per_page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -819,14 +813,13 @@ bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 search_start,
u64 *found_start_ret, u32 *found_len_ret)
{
- struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage = folio_get_private(folio);
+ const u32 sectors_per_page = fs_info->sectors_per_page;
const unsigned int len = PAGE_SIZE - offset_in_page(search_start);
const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
locked, search_start, len);
- const unsigned int locked_bitmap_start = subpage_info->locked_offset;
- const unsigned int locked_bitmap_end = locked_bitmap_start +
- subpage_info->bitmap_nr_bits;
+ const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked;
+ const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page;
unsigned long flags;
int first_zero;
int first_set;
@@ -855,59 +848,21 @@ out:
return found;
}
-/*
- * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range,
- * this ends all writer locked ranges of a page.
- *
- * This is for the locked page of __extent_writepage(), as the locked page
- * can contain several locked subpage ranges.
- */
-void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- u64 folio_start = folio_pos(folio);
- u64 cur = folio_start;
-
- ASSERT(folio_test_locked(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
- folio_unlock(folio);
- return;
- }
-
- /* The page has no new delalloc range locked on it. Just plain unlock. */
- if (atomic_read(&subpage->writers) == 0) {
- folio_unlock(folio);
- return;
- }
- while (cur < folio_start + PAGE_SIZE) {
- u64 found_start;
- u32 found_len;
- bool found;
- bool last;
-
- found = btrfs_subpage_find_writer_locked(fs_info, folio, cur,
- &found_start, &found_len);
- if (!found)
- break;
- last = btrfs_subpage_end_and_test_writer(fs_info, folio,
- found_start, found_len);
- if (last) {
- folio_unlock(folio);
- break;
- }
- cur = found_start + found_len;
- }
+#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
+{ \
+ const int sectors_per_page = fs_info->sectors_per_page; \
+ \
+ ASSERT(sectors_per_page < BITS_PER_LONG); \
+ *dst = bitmap_read(subpage->bitmaps, \
+ sectors_per_page * btrfs_bitmap_nr_##name, \
+ sectors_per_page); \
}
-#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
- bitmap_cut(dst, subpage->bitmaps, 0, \
- subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
-
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
+ const u32 sectors_per_page = fs_info->sectors_per_page;
unsigned long uptodate_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
@@ -916,25 +871,41 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(subpage_info);
+ ASSERT(sectors_per_page > 1);
subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
- GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
- subpage_info->bitmap_nr_bits, &uptodate_bitmap,
- subpage_info->bitmap_nr_bits, &dirty_bitmap,
- subpage_info->bitmap_nr_bits, &writeback_bitmap,
- subpage_info->bitmap_nr_bits, &ordered_bitmap,
- subpage_info->bitmap_nr_bits, &checked_bitmap);
+ sectors_per_page, &uptodate_bitmap,
+ sectors_per_page, &dirty_bitmap,
+ sectors_per_page, &writeback_bitmap,
+ sectors_per_page, &ordered_bitmap,
+ sectors_per_page, &checked_bitmap);
+}
+
+void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+ struct folio *folio,
+ unsigned long *ret_bitmap)
+{
+ struct btrfs_subpage *subpage;
+ unsigned long flags;
+
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ ASSERT(fs_info->sectors_per_page > 1);
+ subpage = folio_get_private(folio);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap);
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
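btrfs_folio_end_writer_lock_bitmap() introduced above turns an arbitrary set of finished sectors into a single writers decrement: clear each set bit, count how many actually flipped, subtract that count from subpage->writers, and unlock the folio once it reaches zero. The counting logic in isolation, as a userspace sketch:

#include <stdbool.h>
#include <stdio.h>

/* Clear the bits of @done inside *@locked and report whether any writers remain. */
static bool demo_end_writers(unsigned long *locked, unsigned long done,
			     unsigned int *writers)
{
	unsigned long hit = *locked & done;
	unsigned int cleared = 0;

	while (hit) {
		hit &= hit - 1;		/* drop the lowest set bit */
		cleared++;
	}
	*locked &= ~done;
	*writers -= cleared;
	return *writers == 0;		/* true: last writer gone, unlock the folio */
}

int main(void)
{
	unsigned long locked = 0x0f;	/* four sectors currently writer-locked */
	unsigned int writers = 4;

	printf("%d\n", demo_end_writers(&locked, 0x03, &writers));	/* 0: two writers left */
	printf("%d\n", demo_end_writers(&locked, 0x0c, &writers));	/* 1: all writers done */
	return 0;
}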
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 249396e118d0..4b85d91d0e18 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -5,6 +5,7 @@
#include <linux/spinlock.h>
#include <linux/atomic.h>
+#include <linux/sizes.h>
struct address_space;
struct folio;
@@ -18,39 +19,23 @@ struct btrfs_fs_info;
*
* This structure records how they are organized in the bitmap:
*
- * /- uptodate_offset /- dirty_offset /- ordered_offset
+ * /- uptodate /- dirty /- ordered
* | | |
* v v v
* |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o|
- * |<- bitmap_nr_bits ->|
- * |<----------------- total_nr_bits ------------------>|
+ * |< sectors_per_page >|
+ *
+ * Unlike regular macro-like enums, here we do not use upper-case names, as
+ * these names will be utilized in various macros to define function names.
*/
-struct btrfs_subpage_info {
- /* Number of bits for each bitmap */
- unsigned int bitmap_nr_bits;
-
- /* Total number of bits for the whole bitmap */
- unsigned int total_nr_bits;
-
- /*
- * *_offset indicates where the bitmap starts, the length is always
- * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize.
- */
- unsigned int uptodate_offset;
- unsigned int dirty_offset;
- unsigned int writeback_offset;
- unsigned int ordered_offset;
- unsigned int checked_offset;
-
- /*
- * For locked bitmaps, normally it's subpage representation for folio
- * Locked flag, but metadata is different:
- *
- * - Metadata doesn't really lock the folio
- * It's just to prevent page::private get cleared before the last
- * end_page_read().
- */
- unsigned int locked_offset;
+enum {
+ btrfs_bitmap_nr_uptodate = 0,
+ btrfs_bitmap_nr_dirty,
+ btrfs_bitmap_nr_writeback,
+ btrfs_bitmap_nr_ordered,
+ btrfs_bitmap_nr_checked,
+ btrfs_bitmap_nr_locked,
+ btrfs_bitmap_nr_max
};
/*
@@ -88,9 +73,16 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+#if PAGE_SIZE > SZ_4K
bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
+#else
+static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
+ struct address_space *mapping)
+{
+ return false;
+}
+#endif
-void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct folio *folio, enum btrfs_subpage_type type);
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
@@ -114,10 +106,11 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap);
bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 search_start,
u64 *found_start_ret, u32 *found_len_ret);
-void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio);
/*
* Template for subpage related operations.
@@ -164,8 +157,9 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
-void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
+void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
+ struct folio *folio,
+ unsigned long *ret_bitmap);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
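With struct btrfs_subpage_info gone, every per-folio state now occupies a fixed window inside one shared bitmap: state N covers bits [N * sectors_per_page, (N + 1) * sectors_per_page). A tiny userspace illustration of that layout, assuming 4 KiB sectors in a 64 KiB page (16 sectors per page; names are illustrative):

#include <stdio.h>

enum { demo_nr_uptodate = 0, demo_nr_dirty, demo_nr_writeback,
       demo_nr_ordered, demo_nr_checked, demo_nr_locked, demo_nr_max };

static unsigned int demo_bit(unsigned int sectors_per_page, int state,
			     unsigned int sector_in_page)
{
	return sectors_per_page * state + sector_in_page;
}

int main(void)
{
	const unsigned int sectors_per_page = 16;	/* 64K page / 4K sector */

	/* The dirty window is bits 16..31; sector index 2 maps to bit 18. */
	printf("dirty window starts at bit %u, sector 2 maps to bit %u\n",
	       demo_bit(sectors_per_page, demo_nr_dirty, 0),
	       demo_bit(sectors_per_page, demo_nr_dirty, 2));
	printf("total bitmap bits: %u\n", sectors_per_page * demo_nr_max);
	return 0;
}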
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 08d33cb372fb..98fa0f382480 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,6 +28,7 @@
#include <linux/btrfs.h>
#include <linux/security.h>
#include <linux/fs_parser.h>
+#include <linux/swap.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -683,8 +684,11 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
ret = false;
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
- if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE))
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
btrfs_info(info, "disk space caching is enabled");
+ btrfs_warn(info,
+"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
+ }
if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
btrfs_info(info, "using free-space-tree");
}
@@ -2398,7 +2402,13 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro
trace_btrfs_extent_map_shrinker_count(fs_info, nr);
- return nr;
+ /*
+ * Only report the real number for DEBUG builds, as there are reports of
+ * serious performance degradation caused by too frequent shrinks.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return nr;
+ return 0;
}
static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
@@ -2406,6 +2416,15 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ /*
+ * We may be called from any task trying to allocate memory and we don't
+ * want to slow it down with scanning and dropping extent maps. It would
+ * also cause heavy lock contention if many tasks concurrently enter
+ * here. Therefore only allow kswapd tasks to scan and drop extent maps.
+ */
+ if (!current_is_kswapd())
+ return 0;
+
return btrfs_free_extent_maps(fs_info, nr_to_scan);
}
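Both shrinker hooks above are throttled the same way: the count callback only reports real numbers on debug builds, and the scan callback bails out unless it runs as kswapd, so direct-reclaim allocators never pay for extent map pruning or fight over its locks. A minimal sketch of the scan side (demo_prune() is a placeholder for the real reclaim work):

#include <linux/fs.h>
#include <linux/shrinker.h>
#include <linux/swap.h>

static long demo_prune(struct super_block *sb, long nr_to_scan)
{
	/* Placeholder: walk and drop cached objects, return how many were freed. */
	return 0;
}

static long demo_free_cached_objects(struct super_block *sb,
				     struct shrink_control *sc)
{
	/* Only kswapd pays the scanning cost; everyone else returns immediately. */
	if (!current_is_kswapd())
		return 0;

	return demo_prune(sb, (long)sc->nr_to_scan);
}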
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 865d4af4b303..0a2dbfaaf49e 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -180,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
start = 0;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("should have found at least one delalloc");
@@ -211,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("couldn't find delalloc in our range");
@@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
}
start = test_start;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
@@ -266,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("didn't find our range");
@@ -307,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* this changes at any point in the future we will need to fix this
 * test's expected behavior.
*/
- found = find_lock_delalloc_range(inode, locked_page, &start,
+ found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
&end);
if (!found) {
test_err("didn't find our range");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5e6fff8e1003..0fc873af891f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -143,8 +143,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(
&transaction->delayed_refs.href_root.rb_root));
- WARN_ON(!RB_EMPTY_ROOT(
- &transaction->delayed_refs.dirty_extent_root));
+ WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -351,7 +350,7 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
- cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
+ xa_init(&cur_trans->delayed_refs.dirty_extents);
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
/*
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 98c03ddc760b..dd9ce9b9f69e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,6 +27,12 @@ struct btrfs_root_item;
struct btrfs_root;
struct btrfs_path;
+/*
+ * Signal that a direct IO write is in progress, to avoid deadlock for sync
+ * direct IO writes when fsync is called during the direct IO write path.
+ */
+#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
+
/* Radix-tree tag for roots that are part of the transaction. */
#define BTRFS_ROOT_TRANS_TAG 0
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a825fa598e3c..634d69964fe4 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -569,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf,
/* dir type check */
dir_type = btrfs_dir_ftype(leaf, di);
- if (unlikely(dir_type >= BTRFS_FT_MAX)) {
+ if (unlikely(dir_type <= BTRFS_FT_UNKNOWN ||
+ dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot,
- "invalid dir item type, have %u expect [0, %u)",
+ "invalid dir item type, have %u expect (0, %u)",
dir_type, BTRFS_FT_MAX);
return -EUCLEAN;
}
@@ -1763,6 +1764,72 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf,
return 0;
}
+static int check_dev_extent_item(const struct extent_buffer *leaf,
+ const struct btrfs_key *key,
+ int slot,
+ struct btrfs_key *prev_key)
+{
+ struct btrfs_dev_extent *de;
+ const u32 sectorsize = leaf->fs_info->sectorsize;
+
+ de = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+ /* Basic fixed member checks. */
+ if (unlikely(btrfs_dev_extent_chunk_tree(leaf, de) !=
+ BTRFS_CHUNK_TREE_OBJECTID)) {
+ generic_err(leaf, slot,
+ "invalid dev extent chunk tree id, has %llu expect %llu",
+ btrfs_dev_extent_chunk_tree(leaf, de),
+ BTRFS_CHUNK_TREE_OBJECTID);
+ return -EUCLEAN;
+ }
+ if (unlikely(btrfs_dev_extent_chunk_objectid(leaf, de) !=
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
+ generic_err(leaf, slot,
+ "invalid dev extent chunk objectid, has %llu expect %llu",
+ btrfs_dev_extent_chunk_objectid(leaf, de),
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ return -EUCLEAN;
+ }
+ /* Alignment check. */
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
+ generic_err(leaf, slot,
+ "invalid dev extent key.offset, has %llu not aligned to %u",
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(btrfs_dev_extent_chunk_offset(leaf, de),
+ sectorsize))) {
+ generic_err(leaf, slot,
+ "invalid dev extent chunk offset, has %llu not aligned to %u",
+ btrfs_dev_extent_chunk_objectid(leaf, de),
+ sectorsize);
+ return -EUCLEAN;
+ }
+ if (unlikely(!IS_ALIGNED(btrfs_dev_extent_length(leaf, de),
+ sectorsize))) {
+ generic_err(leaf, slot,
+ "invalid dev extent length, has %llu not aligned to %u",
+ btrfs_dev_extent_length(leaf, de), sectorsize);
+ return -EUCLEAN;
+ }
+ /* Overlap check with previous dev extent. */
+ if (slot && prev_key->objectid == key->objectid &&
+ prev_key->type == key->type) {
+ struct btrfs_dev_extent *prev_de;
+ u64 prev_len;
+
+ prev_de = btrfs_item_ptr(leaf, slot - 1, struct btrfs_dev_extent);
+ prev_len = btrfs_dev_extent_length(leaf, prev_de);
+ if (unlikely(prev_key->offset + prev_len > key->offset)) {
+ generic_err(leaf, slot,
+ "dev extent overlap, prev offset %llu len %llu current offset %llu",
+ prev_key->objectid, prev_len, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ return 0;
+}
+
/*
* Common point to switch the item-specific validation.
*/
@@ -1799,6 +1866,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
case BTRFS_DEV_ITEM_KEY:
ret = check_dev_item(leaf, key, slot);
break;
+ case BTRFS_DEV_EXTENT_KEY:
+ ret = check_dev_extent_item(leaf, key, slot, prev_key);
+ break;
case BTRFS_INODE_ITEM_KEY:
ret = check_inode_item(leaf, key, slot);
break;
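check_dev_extent_item() added above enforces two invariants before the item is accepted: every offset and length is sectorsize aligned, and consecutive dev extents of the same device never overlap, i.e. prev.offset + prev.length <= cur.offset. A standalone sketch of those two checks with purely illustrative types (not the extent-buffer accessors):

#include <stdbool.h>

struct demo_dev_extent {
	unsigned long long offset;	/* key.offset: start of the extent on the device */
	unsigned long long length;
};

static bool demo_is_aligned(unsigned long long value, unsigned int sectorsize)
{
	return (value & (sectorsize - 1)) == 0;	/* sectorsize is a power of two */
}

/* @prev may be NULL when @cur is the first extent of its device. */
static bool demo_dev_extent_valid(const struct demo_dev_extent *prev,
				  const struct demo_dev_extent *cur,
				  unsigned int sectorsize)
{
	if (!demo_is_aligned(cur->offset, sectorsize) ||
	    !demo_is_aligned(cur->length, sectorsize))
		return false;
	/* Overlap check against the previous extent of the same device. */
	if (prev && prev->offset + prev->length > cur->offset)
		return false;
	return true;
}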
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f0cf8ce26f01..e2ed2a791f8f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2877,7 +2877,7 @@ void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_extent *tmp;
- ASSERT(inode_is_locked(&ctx->inode->vfs_inode));
+ btrfs_assert_inode_locked(ctx->inode);
list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
list_del_init(&ordered->log_list);
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index fa45b5fb9683..b382a4c443d4 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -170,7 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
* this until all tree mod log insertions are recorded in the rb tree and then
* write unlock fs_info::tree_mod_log_lock.
*/
-static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
@@ -188,7 +188,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffe
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
+ const struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
@@ -198,7 +198,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
return true;
}
-static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
+static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
int slot,
enum btrfs_mod_log_op op)
{
@@ -221,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
return tm;
}
-int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op)
{
struct tree_mod_elem *tm;
@@ -258,7 +258,7 @@ out_unlock:
return ret;
}
-static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
+static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items)
{
@@ -278,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
return tm;
}
-int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items)
{
@@ -535,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
}
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src,
+ const struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset,
int nr_items)
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index ff00c8e8a393..6308c577a4a4 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -37,7 +37,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
struct extent_buffer *new_root,
bool log_removal);
-int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
@@ -47,11 +47,11 @@ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src,
+ const struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset,
int nr_items);
-int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items);
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index eae75bb572b9..c6399513c66f 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -3,6 +3,7 @@
* Copyright (C) STRATO AG 2013. All rights reserved.
*/
+#include <linux/kthread.h>
#include <linux/uuid.h>
#include <asm/unaligned.h>
#include "messages.h"
@@ -12,6 +13,7 @@
#include "fs.h"
#include "accessors.h"
#include "uuid-tree.h"
+#include "ioctl.h"
static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key)
{
@@ -390,3 +392,180 @@ out:
btrfs_free_path(path);
return ret;
}
+
+int btrfs_uuid_scan_kthread(void *data)
+{
+ struct btrfs_fs_info *fs_info = data;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ int ret = 0;
+ struct extent_buffer *eb;
+ int slot;
+ struct btrfs_root_item root_item;
+ u32 item_size;
+ struct btrfs_trans_handle *trans = NULL;
+ bool closing = false;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ if (btrfs_fs_closing(fs_info)) {
+ closing = true;
+ break;
+ }
+ ret = btrfs_search_forward(root, &key, path,
+ BTRFS_OLDEST_GENERATION);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+ if (key.type != BTRFS_ROOT_ITEM_KEY ||
+ (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+ key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+ key.objectid > BTRFS_LAST_FREE_OBJECTID)
+ goto skip;
+
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size(eb, slot);
+ if (item_size < sizeof(root_item))
+ goto skip;
+
+ read_extent_buffer(eb, &root_item,
+ btrfs_item_ptr_offset(eb, slot),
+ (int)sizeof(root_item));
+ if (btrfs_root_refs(&root_item) == 0)
+ goto skip;
+
+ if (!btrfs_is_empty_uuid(root_item.uuid) ||
+ !btrfs_is_empty_uuid(root_item.received_uuid)) {
+ if (trans)
+ goto update_tree;
+
+ btrfs_release_path(path);
+ /*
+ * 1 - subvol uuid item
+ * 1 - received_subvol uuid item
+ */
+ trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ continue;
+ } else {
+ goto skip;
+ }
+update_tree:
+ btrfs_release_path(path);
+ if (!btrfs_is_empty_uuid(root_item.uuid)) {
+ ret = btrfs_uuid_tree_add(trans, root_item.uuid,
+ BTRFS_UUID_KEY_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+ if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+ ret = btrfs_uuid_tree_add(trans,
+ root_item.received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ key.objectid);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
+ ret);
+ break;
+ }
+ }
+
+skip:
+ btrfs_release_path(path);
+ if (trans) {
+ ret = btrfs_end_transaction(trans);
+ trans = NULL;
+ if (ret)
+ break;
+ }
+
+ if (key.offset < (u64)-1) {
+ key.offset++;
+ } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ } else if (key.objectid < (u64)-1) {
+ key.offset = 0;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.objectid++;
+ } else {
+ break;
+ }
+ cond_resched();
+ }
+
+out:
+ btrfs_free_path(path);
+ if (trans && !IS_ERR(trans))
+ btrfs_end_transaction(trans);
+ if (ret)
+ btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
+ else if (!closing)
+ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
+ up(&fs_info->uuid_tree_rescan_sem);
+ return 0;
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *uuid_root;
+ struct task_struct *task;
+ int ret;
+
+ /*
+ * 1 - root node
+ * 1 - root item
+ */
+ trans = btrfs_start_transaction(tree_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
+ if (IS_ERR(uuid_root)) {
+ ret = PTR_ERR(uuid_root);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ fs_info->uuid_root = uuid_root;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ return ret;
+
+ down(&fs_info->uuid_tree_rescan_sem);
+ task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+ if (IS_ERR(task)) {
+		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
+ btrfs_warn(fs_info, "failed to start uuid_scan task");
+ up(&fs_info->uuid_tree_rescan_sem);
+ return PTR_ERR(task);
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
index a3f5757cc7cf..c60ad20325cc 100644
--- a/fs/btrfs/uuid-tree.h
+++ b/fs/btrfs/uuid-tree.h
@@ -13,5 +13,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ
int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_uuid_scan_kthread(void *data);
#endif
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 4042dd6437ae..e97ad824ae16 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -284,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
* page and ignore dest, but it must still be non-NULL to avoid the
* counting-only behavior.
* @len: length in bytes to read
- * @dest_page: copy into this page instead of the dest buffer
+ * @dest_folio: copy into this folio instead of the dest buffer
*
* Helper function to read items from the btree. This returns the number of
* bytes read or < 0 for errors. We can return short reads if the items don't
@@ -294,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
* Returns number of bytes read or a negative error code on failure.
*/
static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
- char *dest, u64 len, struct page *dest_page)
+ char *dest, u64 len, struct folio *dest_folio)
{
struct btrfs_path *path;
struct btrfs_root *root = inode->root;
@@ -314,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
if (!path)
return -ENOMEM;
- if (dest_page)
+ if (dest_folio)
path->reada = READA_FORWARD;
key.objectid = btrfs_ino(inode);
@@ -371,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset,
copy_offset = offset - key.offset;
if (dest) {
- if (dest_page)
- kaddr = kmap_local_page(dest_page);
+ if (dest_folio)
+ kaddr = kmap_local_folio(dest_folio, 0);
data = btrfs_item_ptr(leaf, path->slots[0], void);
read_extent_buffer(leaf, kaddr + dest_offset,
(unsigned long)data + copy_offset,
copy_bytes);
- if (dest_page)
+ if (dest_folio)
kunmap_local(kaddr);
}
@@ -460,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
int ret;
- ASSERT(inode_is_locked(&inode->vfs_inode));
+ btrfs_assert_inode_locked(inode);
truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size);
clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags);
ret = btrfs_drop_verity_items(inode);
@@ -585,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp)
struct btrfs_trans_handle *trans;
int ret;
- ASSERT(inode_is_locked(file_inode(filp)));
+ btrfs_assert_inode_locked(inode);
if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags))
return -EBUSY;
@@ -633,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc,
int ret = 0;
int rollback_ret;
- ASSERT(inode_is_locked(file_inode(filp)));
+ btrfs_assert_inode_locked(inode);
if (desc == NULL)
goto rollback;
@@ -762,7 +762,7 @@ again:
* [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ]
*/
ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off,
- folio_address(folio), PAGE_SIZE, &folio->page);
+ folio_address(folio), PAGE_SIZE, folio);
if (ret < 0) {
folio_put(folio);
return ERR_PTR(ret);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fcedc43ef291..8f340ad1d938 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -476,6 +476,8 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
if (IS_ERR(*bdev_file)) {
ret = PTR_ERR(*bdev_file);
+ btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d",
+ device_path, flags, ret);
goto error;
}
bdev = file_bdev(*bdev_file);
@@ -4784,183 +4786,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_uuid_scan_kthread(void *data)
-{
- struct btrfs_fs_info *fs_info = data;
- struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_key key;
- struct btrfs_path *path = NULL;
- int ret = 0;
- struct extent_buffer *eb;
- int slot;
- struct btrfs_root_item root_item;
- u32 item_size;
- struct btrfs_trans_handle *trans = NULL;
- bool closing = false;
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- key.objectid = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = 0;
-
- while (1) {
- if (btrfs_fs_closing(fs_info)) {
- closing = true;
- break;
- }
- ret = btrfs_search_forward(root, &key, path,
- BTRFS_OLDEST_GENERATION);
- if (ret) {
- if (ret > 0)
- ret = 0;
- break;
- }
-
- if (key.type != BTRFS_ROOT_ITEM_KEY ||
- (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
- key.objectid != BTRFS_FS_TREE_OBJECTID) ||
- key.objectid > BTRFS_LAST_FREE_OBJECTID)
- goto skip;
-
- eb = path->nodes[0];
- slot = path->slots[0];
- item_size = btrfs_item_size(eb, slot);
- if (item_size < sizeof(root_item))
- goto skip;
-
- read_extent_buffer(eb, &root_item,
- btrfs_item_ptr_offset(eb, slot),
- (int)sizeof(root_item));
- if (btrfs_root_refs(&root_item) == 0)
- goto skip;
-
- if (!btrfs_is_empty_uuid(root_item.uuid) ||
- !btrfs_is_empty_uuid(root_item.received_uuid)) {
- if (trans)
- goto update_tree;
-
- btrfs_release_path(path);
- /*
- * 1 - subvol uuid item
- * 1 - received_subvol uuid item
- */
- trans = btrfs_start_transaction(fs_info->uuid_root, 2);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- continue;
- } else {
- goto skip;
- }
-update_tree:
- btrfs_release_path(path);
- if (!btrfs_is_empty_uuid(root_item.uuid)) {
- ret = btrfs_uuid_tree_add(trans, root_item.uuid,
- BTRFS_UUID_KEY_SUBVOL,
- key.objectid);
- if (ret < 0) {
- btrfs_warn(fs_info, "uuid_tree_add failed %d",
- ret);
- break;
- }
- }
-
- if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
- ret = btrfs_uuid_tree_add(trans,
- root_item.received_uuid,
- BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- key.objectid);
- if (ret < 0) {
- btrfs_warn(fs_info, "uuid_tree_add failed %d",
- ret);
- break;
- }
- }
-
-skip:
- btrfs_release_path(path);
- if (trans) {
- ret = btrfs_end_transaction(trans);
- trans = NULL;
- if (ret)
- break;
- }
-
- if (key.offset < (u64)-1) {
- key.offset++;
- } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
- key.offset = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- } else if (key.objectid < (u64)-1) {
- key.offset = 0;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.objectid++;
- } else {
- break;
- }
- cond_resched();
- }
-
-out:
- btrfs_free_path(path);
- if (trans && !IS_ERR(trans))
- btrfs_end_transaction(trans);
- if (ret)
- btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
- else if (!closing)
- set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
- up(&fs_info->uuid_tree_rescan_sem);
- return 0;
-}
-
-int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_root *uuid_root;
- struct task_struct *task;
- int ret;
-
- /*
- * 1 - root node
- * 1 - root item
- */
- trans = btrfs_start_transaction(tree_root, 2);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
- if (IS_ERR(uuid_root)) {
- ret = PTR_ERR(uuid_root);
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- fs_info->uuid_root = uuid_root;
-
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
-
- down(&fs_info->uuid_tree_rescan_sem);
- task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
- if (IS_ERR(task)) {
- /* fs_info->update_uuid_tree_gen remains 0 in all error case */
- btrfs_warn(fs_info, "failed to start uuid_scan task");
- up(&fs_info->uuid_tree_rescan_sem);
- return PTR_ERR(task);
- }
-
- return 0;
-}
-
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -5956,11 +5781,31 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
write_unlock(&fs_info->mapping_tree_lock);
}
+static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+
+ /*
+	 * There could be two corrupted data stripes, so we need to retry in a
+	 * loop in order to rebuild the correct data.
+ *
+ * Fail a stripe at a time on every retry except the stripe under
+ * reconstruction.
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return map->num_stripes;
+
+ /* Non-RAID56, use their ncopies from btrfs_raid_array. */
+ return btrfs_raid_array[index].ncopies;
+}
+
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct btrfs_chunk_map *map;
- enum btrfs_raid_types index;
- int ret = 1;
+ int ret;
map = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(map))
@@ -5972,22 +5817,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
return 1;
- index = btrfs_bg_flags_to_raid_index(map->type);
-
- /* Non-RAID56, use their ncopies from btrfs_raid_array. */
- if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
- ret = btrfs_raid_array[index].ncopies;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
- ret = 2;
- else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- /*
- * There could be two corrupted data stripes, we need
- * to loop retry in order to rebuild the correct data.
- *
- * Fail a stripe at a time on every retry except the
- * stripe under reconstruction.
- */
- ret = map->num_stripes;
+ ret = btrfs_chunk_map_num_copies(map);
btrfs_free_chunk_map(map);
return ret;
}
@@ -6637,14 +6467,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom.stripe_index = 0;
io_geom.op = op;
- num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
- if (io_geom.mirror_num > num_copies)
- return -EINVAL;
-
map = btrfs_get_chunk_map(fs_info, logical, *length);
if (IS_ERR(map))
return PTR_ERR(map);
+ num_copies = btrfs_chunk_map_num_copies(map);
+ if (io_geom.mirror_num > num_copies)
+ return -EINVAL;
+
map_offset = logical - map->start;
io_geom.raid56_full_stripe_start = (u64)-1;
max_len = btrfs_max_io_len(map, map_offset, &io_geom);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 37a09ebb34dd..03d2d60afe0c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -444,7 +444,7 @@ struct btrfs_io_stripe {
/* Block mapping. */
u64 physical;
u64 length;
- bool is_scrub;
+ bool rst_search_commit_root;
/* For the endio handler. */
struct btrfs_io_context *bioc;
};
@@ -725,8 +725,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
-int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
-int btrfs_uuid_scan_kthread(void *data);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 738c7bb8ea7c..ce464cd8e0ac 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -120,7 +120,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
* locks the inode's i_mutex before calling setxattr or removexattr.
*/
if (flags & XATTR_REPLACE) {
- ASSERT(inode_is_locked(inode));
+ btrfs_assert_inode_locked(BTRFS_I(inode));
di = btrfs_lookup_xattr(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), name, name_len, 0);
if (!di)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 30971dd741e2..100abc00b794 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -20,6 +20,8 @@
#include <linux/refcount.h>
#include "btrfs_inode.h"
#include "compression.h"
+#include "fs.h"
+#include "subpage.h"
/* workspace buffer size for s390 zlib hardware support */
#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)
@@ -108,6 +110,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ const u64 orig_end = start + len;
*out_folios = 0;
*total_out = 0;
@@ -153,6 +156,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
if (in_buf_folios > 1) {
int i;
+ /* S390 hardware acceleration path, not subpage. */
+ ASSERT(!btrfs_is_subpage(
+ inode_to_fs_info(mapping->host),
+ mapping));
for (i = 0; i < in_buf_folios; i++) {
if (data_in) {
kunmap_local(data_in);
@@ -167,9 +174,14 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
copy_page(workspace->buf + i * PAGE_SIZE,
data_in);
start += PAGE_SIZE;
+ workspace->strm.avail_in =
+ (in_buf_folios << PAGE_SHIFT);
}
workspace->strm.next_in = workspace->buf;
} else {
+ unsigned int pg_off;
+ unsigned int cur_len;
+
if (data_in) {
kunmap_local(data_in);
folio_put(in_folio);
@@ -179,12 +191,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
start, &in_folio);
if (ret < 0)
goto out;
- data_in = kmap_local_folio(in_folio, 0);
+ pg_off = offset_in_page(start);
+ cur_len = btrfs_calc_input_length(orig_end, start);
+ data_in = kmap_local_folio(in_folio, pg_off);
start += PAGE_SIZE;
workspace->strm.next_in = data_in;
+ workspace->strm.avail_in = cur_len;
}
- workspace->strm.avail_in = min(bytes_left,
- (unsigned long) workspace->buf_size);
}
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
@@ -380,7 +393,7 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -408,12 +421,12 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
ret = zlib_inflateInit2(&workspace->strm, wbits);
if (unlikely(ret != Z_OK)) {
- struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
btrfs_err(inode->root->fs_info,
"zlib decompression init failed, error %d root %llu inode %llu offset %llu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
- page_offset(dest_page));
+ folio_pos(dest_folio));
return -EIO;
}
@@ -426,16 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
if (ret != Z_STREAM_END)
goto out;
- memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
+ memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy);
out:
if (unlikely(to_copy != destlen)) {
- struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
btrfs_err(inode->root->fs_info,
"zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu",
ret, btrfs_root_id(inode->root), btrfs_ino(inode),
- page_offset(dest_page), to_copy, destlen);
+ folio_pos(dest_folio), to_copy, destlen);
ret = -EIO;
} else {
ret = 0;
@@ -444,7 +457,7 @@ out:
zlib_inflateEnd(&workspace->strm);
if (unlikely(to_copy < destlen))
- memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
+ folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
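
[Note on the zlib hunks above, which also applies to the matching zstd hunks further down: instead of always feeding PAGE_SIZE chunks, the compressors now map the input folio at offset_in_page(start) and clamp each chunk with btrfs_calc_input_length(), so subpage (sectorsize < PAGE_SIZE) ranges are not over-read. The helper itself is not part of this diff; a plausible sketch of the clamping it is expected to perform, given how orig_end and start are used above:]

/*
 * Illustrative sketch only -- the real helper lives elsewhere in btrfs
 * and may differ in detail.  Clamp the next input chunk to whichever
 * comes first: the end of the current page or the end of the range
 * being compressed.
 */
static inline u32 btrfs_calc_input_length(u64 orig_end, u64 cur)
{
	const u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;

	return min(orig_end, page_end) - cur;
}

[With that reading, cur_len together with pg_off keeps both the kmap window and strm.avail_in inside a single page and inside the requested range.]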
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 66f63e82af79..7fa2920632ba 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -287,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -304,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out;
+ return ret;
/* No dev extents at all? Not good */
- if (ret > 0) {
- ret = -EUCLEAN;
- goto out;
- }
+ if (ret > 0)
+ return -EUCLEAN;
}
leaf = path->nodes[0];
dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
- ret = 0;
-
-out:
- btrfs_free_path(path);
-
- return ret;
+ return 0;
}
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@@ -1211,7 +1204,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
@@ -1246,7 +1239,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
if (!ret)
ret = -EUCLEAN;
if (ret < 0)
- goto out;
+ return ret;
ret = btrfs_previous_extent_item(root, path, cache->start);
if (ret) {
@@ -1254,7 +1247,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
ret = 0;
*offset_ret = 0;
}
- goto out;
+ return ret;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
@@ -1266,15 +1259,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache,
if (!(found_key.objectid >= cache->start &&
found_key.objectid + length <= cache->start + cache->length)) {
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
*offset_ret = found_key.objectid + length - cache->start;
- ret = 0;
-
-out:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
struct zone_info {
@@ -1406,6 +1394,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
return -EINVAL;
}
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
+
if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
btrfs_err(bg->fs_info,
"zoned: cannot recover write pointer for zone %llu",
@@ -1432,7 +1422,6 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
}
bg->alloc_offset = zone_info[0].alloc_offset;
- bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
return 0;
}
@@ -1450,6 +1439,9 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
return -EINVAL;
}
+ /* In case a device is missing we have a cap of 0, so don't use it. */
+ bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
+
for (i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
zone_info[i].alloc_offset == WP_CONVENTIONAL)
@@ -1471,9 +1463,6 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
if (test_bit(0, active))
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
- /* In case a device is missing we have a cap of 0, so don't use it. */
- bg->zone_capacity = min_not_zero(zone_info[0].capacity,
- zone_info[1].capacity);
}
if (zone_info[0].alloc_offset != WP_MISSING_DEV)
@@ -1563,6 +1552,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
+ u64 profile;
if (!btrfs_is_zoned(fs_info))
return 0;
@@ -1623,7 +1613,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
}
- switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+ switch (profile) {
case 0: /* single */
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
@@ -1650,6 +1641,23 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
+ profile != BTRFS_BLOCK_GROUP_RAID10) {
+ /*
+ * Detected broken write pointer. Make this block group
+ * unallocatable by setting the allocation pointer at the end of
+ * allocatable region. Relocating this block group will fix the
+ * mismatch.
+ *
+		 * Currently, we cannot handle the RAID0 or RAID10 case like this
+		 * because we don't have a proper zone_capacity value. But reading
+		 * from this block group won't work anyway because of a missing
+		 * stripe.
+ */
+ cache->alloc_offset = cache->zone_capacity;
+ ret = 0;
+ }
+
out:
/* Reject non SINGLE data profiles without RST */
if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
@@ -2439,7 +2447,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_devices->device_list_mutex);
}
-bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
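
[The two zoned.c conversions above rely on BTRFS_PATH_AUTO_FREE() so that early returns no longer need an out: label that calls btrfs_free_path(). The macro is not shown in this diff; it presumably builds on the kernel's scope-based cleanup helpers from <linux/cleanup.h>, roughly along these lines:]

/*
 * Hypothetical sketch only -- the real macro is defined elsewhere in
 * btrfs.  DEFINE_FREE()/__free() arrange for the named cleanup routine
 * to run when the variable goes out of scope; btrfs_free_path(NULL)
 * is expected to be a harmless no-op.
 */
DEFINE_FREE(btrfs_free_path, struct btrfs_path *, if (_T) btrfs_free_path(_T))

#define BTRFS_PATH_AUTO_FREE(path_name) \
	struct btrfs_path *path_name __free(btrfs_free_path) = NULL

[With the attribute-based cleanup in place, the converted functions can simply return at each failure point, which is exactly what the hunks above do.]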
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 30b2e48a1cec..7612e6572605 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -89,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
-bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
@@ -242,7 +242,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
-static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
return false;
}
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 2a079561b2b1..866607fd3e58 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -389,7 +389,10 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
unsigned long tot_out = 0;
unsigned long len = *total_out;
const unsigned long nr_dest_folios = *out_folios;
+ const u64 orig_end = start + len;
unsigned long max_out = nr_dest_folios * PAGE_SIZE;
+ unsigned int pg_off;
+ unsigned int cur_len;
zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
len);
@@ -415,9 +418,11 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- workspace->in_buf.src = kmap_local_folio(in_folio, 0);
+ pg_off = offset_in_page(start);
+ cur_len = btrfs_calc_input_length(orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->in_buf.size = cur_len;
/* Allocate and map in the output buffer */
out_folio = btrfs_alloc_compr_folio();
@@ -494,14 +499,16 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
kunmap_local(workspace->in_buf.src);
workspace->in_buf.src = NULL;
folio_put(in_folio);
- start += PAGE_SIZE;
- len -= PAGE_SIZE;
+ start += cur_len;
+ len -= cur_len;
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- workspace->in_buf.src = kmap_local_folio(in_folio, 0);
+ pg_off = offset_in_page(start);
+ cur_len = btrfs_calc_input_length(orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
workspace->in_buf.pos = 0;
- workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
+ workspace->in_buf.size = cur_len;
}
}
while (1) {
@@ -649,11 +656,11 @@ done:
}
int zstd_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
+ struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
- struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb);
const u32 sectorsize = fs_info->sectorsize;
zstd_dstream *stream;
int ret = 0;
@@ -662,12 +669,12 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
if (unlikely(!stream)) {
- struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
btrfs_err(inode->root->fs_info,
"zstd decompression init failed, root %llu inode %llu offset %llu",
btrfs_root_id(inode->root), btrfs_ino(inode),
- page_offset(dest_page));
+ folio_pos(dest_folio));
ret = -EIO;
goto finish;
}
@@ -686,21 +693,21 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
*/
ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
if (unlikely(zstd_is_error(ret))) {
- struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+ struct btrfs_inode *inode = folio_to_inode(dest_folio);
btrfs_err(inode->root->fs_info,
"zstd decompression failed, error %d root %llu inode %llu offset %llu",
zstd_get_error_code(ret), btrfs_root_id(inode->root),
- btrfs_ino(inode), page_offset(dest_page));
+ btrfs_ino(inode), folio_pos(dest_folio));
goto finish;
}
to_copy = workspace->out_buf.pos;
- memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy);
+ memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy);
finish:
/* Error or early end. */
if (unlikely(to_copy < destlen)) {
ret = -EIO;
- memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
+ folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy);
}
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index e55ad471c530..1fc9a50def0b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -774,12 +774,11 @@ EXPORT_SYMBOL(block_dirty_folio);
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
struct buffer_head *bh;
- struct list_head tmp;
struct address_space *mapping;
int err = 0, err2;
struct blk_plug plug;
+ LIST_HEAD(tmp);
- INIT_LIST_HEAD(&tmp);
blk_start_plug(&plug);
spin_lock(lock);
@@ -958,12 +957,9 @@ no_grow:
}
EXPORT_SYMBOL_GPL(folio_alloc_buffers);
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- bool retry)
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
{
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
- if (retry)
- gfp |= __GFP_NOFAIL;
return folio_alloc_buffers(page_folio(page), size, gfp);
}
@@ -2168,11 +2164,10 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
return err;
}
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block)
{
- return __block_write_begin_int(page_folio(page), pos, len, get_block,
- NULL);
+ return __block_write_begin_int(folio, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);
@@ -2222,33 +2217,33 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to)
* The filesystem needs to handle block truncation upon failure.
*/
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
- struct page **pagep, get_block_t *get_block)
+ struct folio **foliop, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
int status;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- status = __block_write_begin(page, pos, len, get_block);
+ status = __block_write_begin_int(folio, pos, len, get_block, NULL);
if (unlikely(status)) {
- unlock_page(page);
- put_page(page);
- page = NULL;
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- *pagep = page;
+ *foliop = folio;
return status;
}
EXPORT_SYMBOL(block_write_begin);
int block_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
size_t start = pos - folio_pos(folio);
if (unlikely(copied < len)) {
@@ -2280,19 +2275,19 @@ EXPORT_SYMBOL(block_write_end);
int generic_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool i_size_changed = false;
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
/*
* No need to use i_size_read() here, the i_size cannot change under us
* because we hold i_rwsem.
*
- * But it's important to update i_size while still holding page lock:
+ * But it's important to update i_size while still holding folio lock:
* page writeout could otherwise come in and zero beyond i_size.
*/
if (pos + copied > inode->i_size) {
@@ -2300,8 +2295,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
i_size_changed = true;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -2467,7 +2462,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int err;
@@ -2475,11 +2470,11 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
if (err)
goto out;
- err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
+ err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata);
if (err)
goto out;
- err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
+ err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata);
BUG_ON(err > 0);
out:
@@ -2493,7 +2488,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
const struct address_space_operations *aops = mapping->a_ops;
unsigned int blocksize = i_blocksize(inode);
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
pgoff_t index, curidx;
loff_t curpos;
@@ -2512,12 +2507,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
len = PAGE_SIZE - zerofrom;
err = aops->write_begin(file, mapping, curpos, len,
- &page, &fsdata);
+ &folio, &fsdata);
if (err)
goto out;
- zero_user(page, zerofrom, len);
+ folio_zero_range(folio, offset_in_folio(folio, curpos), len);
err = aops->write_end(file, mapping, curpos, len, len,
- page, fsdata);
+ folio, fsdata);
if (err < 0)
goto out;
BUG_ON(err != len);
@@ -2545,12 +2540,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
len = offset - zerofrom;
err = aops->write_begin(file, mapping, curpos, len,
- &page, &fsdata);
+ &folio, &fsdata);
if (err)
goto out;
- zero_user(page, zerofrom, len);
+ folio_zero_range(folio, offset_in_folio(folio, curpos), len);
err = aops->write_end(file, mapping, curpos, len, len,
- page, fsdata);
+ folio, fsdata);
if (err < 0)
goto out;
BUG_ON(err != len);
@@ -2566,7 +2561,7 @@ out:
*/
int cont_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata,
+ struct folio **foliop, void **fsdata,
get_block_t *get_block, loff_t *bytes)
{
struct inode *inode = mapping->host;
@@ -2584,7 +2579,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
(*bytes)++;
}
- return block_write_begin(mapping, pos, len, pagep, get_block);
+ return block_write_begin(mapping, pos, len, foliop, get_block);
}
EXPORT_SYMBOL(cont_write_begin);
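
[After the buffer.c conversion, ->write_begin hands back a folio and ->write_end consumes one, so a simple block-based filesystem wires the helpers up along these lines. This is a minimal sketch; myfs_write_begin, myfs_get_block and myfs_aops are stand-in names, not part of this series:]

/* myfs_get_block is the filesystem's own get_block_t callback (stand-in name). */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len,
			    struct folio **foliop, void **fsdata)
{
	return block_write_begin(mapping, pos, len, foliop, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	/* ... */
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,	/* now takes the folio directly */
};

[The fsdata cookie and the unlock/put of the folio are handled by generic_write_end(), as shown in the hunk above.]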
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index a91acd03ee12..6a821a959b59 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -627,11 +627,12 @@ static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *wreq = subreq->rreq;
struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
_enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start);
- subreq->max_len = MAX_RW_COUNT;
- subreq->max_nr_segs = BIO_MAX_VECS;
+ stream->sreq_max_len = MAX_RW_COUNT;
+ stream->sreq_max_segs = BIO_MAX_VECS;
if (!cachefiles_cres_file(cres)) {
if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
@@ -647,6 +648,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
struct netfs_cache_resources *cres = &wreq->cache_resources;
struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
+ struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
const struct cred *saved_cred;
size_t off, pre, post, len = subreq->len;
loff_t start = subreq->start;
@@ -660,6 +662,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
if (off) {
pre = CACHEFILES_DIO_BLOCK_SIZE - off;
if (pre >= len) {
+ fscache_count_dio_misfit();
netfs_write_subrequest_terminated(subreq, len, false);
return;
}
@@ -670,10 +673,22 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
}
/* We also need to end on the cache granularity boundary */
+ if (start + len == wreq->i_size) {
+ size_t part = len % CACHEFILES_DIO_BLOCK_SIZE;
+ size_t need = CACHEFILES_DIO_BLOCK_SIZE - part;
+
+ if (part && stream->submit_extendable_to >= need) {
+ len += need;
+ subreq->len += need;
+ subreq->io_iter.count += need;
+ }
+ }
+
post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1);
if (post) {
len -= post;
if (len == 0) {
+ fscache_count_dio_misfit();
netfs_write_subrequest_terminated(subreq, post, false);
return;
}
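
[A worked example of the new EOF handling above, taking CACHEFILES_DIO_BLOCK_SIZE as 4096 purely for illustration (the constant is defined elsewhere in cachefiles): if the final subrequest ends 1000 bytes into the last DIO block at i_size, then part = 1000 and need = 3096; when stream->submit_extendable_to is at least 3096, the subrequest is padded by 3096 bytes to the block boundary and the trailing-partial-block trim (the post branch) no longer removes anything. Otherwise post = 1000 is trimmed off as before, and a subrequest that shrinks to zero is now counted via fscache_count_dio_misfit() before being terminated.]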
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 4dd8a993c60a..7c6f260a3be5 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -64,9 +64,15 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object)
memcpy(buf->data, fscache_get_aux(object->cookie), len);
ret = cachefiles_inject_write_error();
- if (ret == 0)
- ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache,
- buf, sizeof(struct cachefiles_xattr) + len, 0);
+ if (ret == 0) {
+ ret = mnt_want_write_file(file);
+ if (ret == 0) {
+ ret = vfs_setxattr(&nop_mnt_idmap, dentry,
+ cachefiles_xattr_cache, buf,
+ sizeof(struct cachefiles_xattr) + len, 0);
+ mnt_drop_write_file(file);
+ }
+ }
if (ret < 0) {
trace_cachefiles_vfs_error(object, file_inode(file), ret,
cachefiles_trace_setxattr_error);
@@ -151,8 +157,14 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
int ret;
ret = cachefiles_inject_remove_error();
- if (ret == 0)
- ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache);
+ if (ret == 0) {
+ ret = mnt_want_write(cache->mnt);
+ if (ret == 0) {
+ ret = vfs_removexattr(&nop_mnt_idmap, dentry,
+ cachefiles_xattr_cache);
+ mnt_drop_write(cache->mnt);
+ }
+ }
if (ret < 0) {
trace_cachefiles_vfs_error(object, d_inode(dentry), ret,
cachefiles_trace_remxattr_error);
@@ -208,9 +220,15 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
memcpy(buf->data, p, volume->vcookie->coherency_len);
ret = cachefiles_inject_write_error();
- if (ret == 0)
- ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache,
- buf, len, 0);
+ if (ret == 0) {
+ ret = mnt_want_write(volume->cache->mnt);
+ if (ret == 0) {
+ ret = vfs_setxattr(&nop_mnt_idmap, dentry,
+ cachefiles_xattr_cache,
+ buf, len, 0);
+ mnt_drop_write(volume->cache->mnt);
+ }
+ }
if (ret < 0) {
trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret,
cachefiles_trace_setxattr_error);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8c16bc5250ef..5d9ccda098cc 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -13,6 +13,7 @@
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "super.h"
#include "mds_client.h"
@@ -205,21 +206,6 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
}
}
-static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
-{
- struct inode *inode = subreq->rreq->inode;
- struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
- struct ceph_inode_info *ci = ceph_inode(inode);
- u64 objno, objoff;
- u32 xlen;
-
- /* Truncate the extent at the end of the current block */
- ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
- &objno, &objoff, &xlen);
- subreq->len = min(xlen, fsc->mount_options->rsize);
- return true;
-}
-
static void finish_netfs_read(struct ceph_osd_request *req)
{
struct inode *inode = req->r_inode;
@@ -246,7 +232,8 @@ static void finish_netfs_read(struct ceph_osd_request *req)
if (err >= 0) {
if (sparse && err > 0)
err = ceph_sparse_ext_map_end(op);
- if (err < subreq->len)
+ if (err < subreq->len &&
+ subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (IS_ENCRYPTED(inode) && err > 0) {
err = ceph_fscrypt_decrypt_extents(inode,
@@ -263,7 +250,12 @@ static void finish_netfs_read(struct ceph_osd_request *req)
calc_pages_for(osd_data->alignment,
osd_data->length), false);
}
- netfs_subreq_terminated(subreq, err, false);
+ if (err > 0) {
+ subreq->transferred = err;
+ err = 0;
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
+ netfs_read_subreq_terminated(subreq, err, false);
iput(req->r_inode);
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
@@ -277,12 +269,12 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct iov_iter iter;
ssize_t err = 0;
size_t len;
int mode;
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (rreq->origin != NETFS_DIO_READ)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
if (subreq->start >= inode->i_size)
@@ -299,6 +291,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
req->r_num_caps = 2;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0)
goto out;
@@ -312,17 +305,36 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
}
len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
- iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
- err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
- if (err == 0)
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);
+ if (err == 0) {
err = -EFAULT;
+ } else {
+ subreq->transferred += err;
+ err = 0;
+ }
ceph_mdsc_put_request(req);
out:
- netfs_subreq_terminated(subreq, err, false);
+ netfs_read_subreq_terminated(subreq, err, false);
return true;
}
+static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ u64 objno, objoff;
+ u32 xlen;
+
+ /* Truncate the extent at the end of the current block */
+ ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
+ &objno, &objoff, &xlen);
+ rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
+ return 0;
+}
+
static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
@@ -332,9 +344,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct ceph_client *cl = fsc->client;
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
- struct iov_iter iter;
- int err = 0;
- u64 len = subreq->len;
+ int err;
+ u64 len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
int extent_cnt;
@@ -347,6 +358,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;
+ // TODO: This rounding here is slightly dodgy. It *should* work, for
+ // now, as the cache only deals in blocks that are a multiple of
+ // PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to
+ // happen is for the fscrypt driving to be moved into netfslib and the
+ // data in the cache also to be stored encrypted.
+ len = subreq->len;
ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
@@ -369,8 +386,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
ceph_vinop(inode), subreq->start, subreq->len, len);
- iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
-
/*
* FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
* encrypted inodes. We'd need infrastructure that handles an iov_iter
@@ -382,7 +397,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct page **pages;
size_t page_off;
- err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
+ err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
if (err < 0) {
doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
ceph_vinop(inode), err);
@@ -397,7 +412,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
false);
} else {
- osd_req_op_extent_osd_iter(req, 0, &iter);
+ osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);
}
if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
err = -EIO;
@@ -408,22 +423,27 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req->r_inode = inode;
ihold(inode);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
ceph_osdc_start_request(req->r_osdc, req);
out:
ceph_osdc_put_request(req);
if (err)
- netfs_subreq_terminated(subreq, err, false);
+ netfs_read_subreq_terminated(subreq, err, false);
doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}
static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct inode *inode = rreq->inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = ceph_inode_to_client(inode);
int got = 0, want = CEPH_CAP_FILE_CACHE;
struct ceph_netfs_request_data *priv;
int ret = 0;
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
+
if (rreq->origin != NETFS_READAHEAD)
return 0;
@@ -467,6 +487,7 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
priv->caps = got;
rreq->netfs_priv = priv;
+ rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;
out:
if (ret < 0)
@@ -491,13 +512,18 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
+ .prepare_read = ceph_netfs_prepare_read,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
- .clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
};
#ifdef CONFIG_CEPH_FSCACHE
+static void ceph_set_page_fscache(struct page *page)
+{
+ folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
+}
+
static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
{
struct inode *inode = priv;
@@ -515,6 +541,10 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b
ceph_fscache_write_terminated, inode, true, caching);
}
#else
+static inline void ceph_set_page_fscache(struct page *page)
+{
+}
+
static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
{
}
@@ -706,6 +736,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = wlen;
set_page_writeback(page);
+ if (caching)
+ ceph_set_page_fscache(page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) {
@@ -789,6 +821,8 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
return AOP_WRITEPAGE_ACTIVATE;
}
+ folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
+
err = writepage_nounlock(page, wbc);
if (err == -ERESTARTSYS) {
/* direct memory reclaimer was killed by SIGKILL. return 0
@@ -1062,7 +1096,8 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page)) {
+ if (PageWriteback(page) ||
+ PagePrivate2(page) /* [DEPRECATED] */) {
if (wbc->sync_mode == WB_SYNC_NONE) {
doutc(cl, "%p under writeback\n", page);
unlock_page(page);
@@ -1070,6 +1105,7 @@ get_more_pages:
}
doutc(cl, "waiting on writeback %p\n", page);
wait_on_page_writeback(page);
+ folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
}
if (!clear_page_dirty_for_io(page)) {
@@ -1254,6 +1290,8 @@ new_request:
}
set_page_writeback(page);
+ if (caching)
+ ceph_set_page_fscache(page);
len += thp_size(page);
}
ceph_fscache_write_to_cache(inode, offset, len, caching);
@@ -1486,20 +1524,18 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
*/
static int ceph_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct folio *folio = NULL;
int r;
- r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
+ r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL);
if (r < 0)
return r;
- folio_wait_private_2(folio); /* [DEPRECATED] */
- WARN_ON_ONCE(!folio_test_locked(folio));
- *pagep = &folio->page;
+ folio_wait_private_2(*foliop); /* [DEPRECATED] */
+ WARN_ON_ONCE(!folio_test_locked(*foliop));
return 0;
}
@@ -1509,9 +1545,8 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
*/
static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *subpage, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(subpage);
struct inode *inode = file_inode(file);
struct ceph_client *cl = ceph_inode_to_client(inode);
bool check_cap = false;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 18c72b305858..ddec8c9244ee 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -707,7 +707,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
if (offset != file->f_pos) {
file->f_pos = offset;
- file->f_version = 0;
dfi->file_info.flags &= ~CEPH_F_ATEND;
}
retval = offset;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8f8de8f33abb..4a8eec46254b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
/* Set parameters for the netfs library */
netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
- /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
- __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
spin_lock_init(&ci->i_ceph_lock);
@@ -697,6 +695,7 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes);
+ netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
if (inode->i_state & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6898dc621011..6896fce122e1 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -119,31 +119,43 @@ static const struct fs_parameter_spec coda_param_specs[] = {
{}
};
-static int coda_parse_fd(struct fs_context *fc, int fd)
+static int coda_set_idx(struct fs_context *fc, struct file *file)
{
struct coda_fs_context *ctx = fc->fs_private;
- struct fd f;
struct inode *inode;
int idx;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- inode = file_inode(f.file);
+ inode = file_inode(file);
if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
- fdput(f);
- return invalf(fc, "code: Not coda psdev");
+ return invalf(fc, "coda: Not coda psdev");
}
-
idx = iminor(inode);
- fdput(f);
-
if (idx < 0 || idx >= MAX_CODADEVS)
return invalf(fc, "coda: Bad minor number");
ctx->idx = idx;
return 0;
}
+static int coda_parse_fd(struct fs_context *fc, struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct file *file;
+ int err;
+
+ if (param->type == fs_value_is_file) {
+ file = param->file;
+ param->file = NULL;
+ } else {
+ file = fget(result->uint_32);
+ }
+ if (!file)
+ return -EBADF;
+
+ err = coda_set_idx(fc, file);
+ fput(file);
+ return err;
+}
+
static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct fs_parse_result result;
@@ -155,7 +167,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (opt) {
case Opt_fd:
- return coda_parse_fd(fc, result.uint_32);
+ return coda_parse_fd(fc, param, &result);
}
return 0;
@@ -167,6 +179,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
*/
static int coda_parse_monolithic(struct fs_context *fc, void *_data)
{
+ struct file *file;
struct coda_mount_data *data = _data;
if (!data)
@@ -175,7 +188,11 @@ static int coda_parse_monolithic(struct fs_context *fc, void *_data)
if (data->version != CODA_MOUNT_VERSION)
return invalf(fc, "coda: Bad mount version");
- coda_parse_fd(fc, data->fd);
+ file = fget(data->fd);
+ if (file) {
+ coda_set_idx(fc, file);
+ fput(file);
+ }
return 0;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 7f12ff6ad1d3..53a78b6bbb5b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -18,6 +18,7 @@
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
+#include <linux/sort.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
@@ -464,7 +465,17 @@ static bool dump_interrupted(void)
* but then we need to teach dump_write() to restart and clear
* TIF_SIGPENDING.
*/
- return fatal_signal_pending(current) || freezing(current);
+ if (fatal_signal_pending(current)) {
+ coredump_report_failure("interrupted: fatal signal pending");
+ return true;
+ }
+
+ if (freezing(current)) {
+ coredump_report_failure("interrupted: freezing");
+ return true;
+ }
+
+ return false;
}
static void wait_for_dump_helpers(struct file *file)
@@ -519,7 +530,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
return err;
}
-void do_coredump(const kernel_siginfo_t *siginfo)
+int do_coredump(const kernel_siginfo_t *siginfo)
{
struct core_state core_state;
struct core_name cn;
@@ -527,7 +538,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
struct linux_binfmt * binfmt;
const struct cred *old_cred;
struct cred *cred;
- int retval = 0;
+ int retval;
int ispipe;
size_t *argv = NULL;
int argc = 0;
@@ -551,14 +562,20 @@ void do_coredump(const kernel_siginfo_t *siginfo)
audit_core_dumps(siginfo->si_signo);
binfmt = mm->binfmt;
- if (!binfmt || !binfmt->core_dump)
+ if (!binfmt || !binfmt->core_dump) {
+ retval = -ENOEXEC;
goto fail;
- if (!__get_dumpable(cprm.mm_flags))
+ }
+ if (!__get_dumpable(cprm.mm_flags)) {
+ retval = -EACCES;
goto fail;
+ }
cred = prepare_creds();
- if (!cred)
+ if (!cred) {
+ retval = -EPERM;
goto fail;
+ }
/*
* We cannot trust fsuid as being the "true" uid of the process
* nor do we know its entire history. We only know it was tainted
@@ -586,8 +603,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
struct subprocess_info *sub_info;
if (ispipe < 0) {
- printk(KERN_WARNING "format_corename failed\n");
- printk(KERN_WARNING "Aborting core\n");
+ coredump_report_failure("format_corename failed, aborting core");
+ retval = ispipe;
goto fail_unlock;
}
@@ -607,27 +624,24 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* right pid if a thread in a multi-threaded
* core_pattern process dies.
*/
- printk(KERN_WARNING
- "Process %d(%s) has RLIMIT_CORE set to 1\n",
- task_tgid_vnr(current), current->comm);
- printk(KERN_WARNING "Aborting core\n");
+ coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
+ retval = -EPERM;
goto fail_unlock;
}
cprm.limit = RLIM_INFINITY;
dump_count = atomic_inc_return(&core_dump_count);
if (core_pipe_limit && (core_pipe_limit < dump_count)) {
- printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
- task_tgid_vnr(current), current->comm);
- printk(KERN_WARNING "Skipping core dump\n");
+ coredump_report_failure("over core_pipe_limit, skipping core dump");
+ retval = -E2BIG;
goto fail_dropcount;
}
helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
GFP_KERNEL);
if (!helper_argv) {
- printk(KERN_WARNING "%s failed to allocate memory\n",
- __func__);
+ coredump_report_failure("%s failed to allocate memory", __func__);
+ retval = -ENOMEM;
goto fail_dropcount;
}
for (argi = 0; argi < argc; argi++)
@@ -644,8 +658,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
kfree(helper_argv);
if (retval) {
- printk(KERN_INFO "Core dump to |%s pipe failed\n",
- cn.corename);
+ coredump_report_failure("|%s pipe failed", cn.corename);
goto close_fail;
}
} else {
@@ -654,14 +667,16 @@ void do_coredump(const kernel_siginfo_t *siginfo)
int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
O_LARGEFILE | O_EXCL;
- if (cprm.limit < binfmt->min_coredump)
+ if (cprm.limit < binfmt->min_coredump) {
+ coredump_report_failure("over coredump resource limit, skipping core dump");
+ retval = -E2BIG;
goto fail_unlock;
+ }
if (need_suid_safe && cn.corename[0] != '/') {
- printk(KERN_WARNING "Pid %d(%s) can only dump core "\
- "to fully qualified path!\n",
- task_tgid_vnr(current), current->comm);
- printk(KERN_WARNING "Skipping core dump\n");
+ coredump_report_failure(
+ "this process can only dump core to a fully qualified path, skipping core dump");
+ retval = -EPERM;
goto fail_unlock;
}
@@ -707,20 +722,28 @@ void do_coredump(const kernel_siginfo_t *siginfo)
} else {
cprm.file = filp_open(cn.corename, open_flags, 0600);
}
- if (IS_ERR(cprm.file))
+ if (IS_ERR(cprm.file)) {
+ retval = PTR_ERR(cprm.file);
goto fail_unlock;
+ }
inode = file_inode(cprm.file);
- if (inode->i_nlink > 1)
+ if (inode->i_nlink > 1) {
+ retval = -EMLINK;
goto close_fail;
- if (d_unhashed(cprm.file->f_path.dentry))
+ }
+ if (d_unhashed(cprm.file->f_path.dentry)) {
+ retval = -EEXIST;
goto close_fail;
+ }
/*
* AK: actually i see no reason to not allow this for named
* pipes etc, but keep the previous behaviour for now.
*/
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode)) {
+ retval = -EISDIR;
goto close_fail;
+ }
/*
* Don't dump core if the filesystem changed owner or mode
* of the file during file creation. This is an issue when
@@ -730,19 +753,24 @@ void do_coredump(const kernel_siginfo_t *siginfo)
idmap = file_mnt_idmap(cprm.file);
if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode),
current_fsuid())) {
- pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
- cn.corename);
+ coredump_report_failure("Core dump to %s aborted: "
+ "cannot preserve file owner", cn.corename);
+ retval = -EPERM;
goto close_fail;
}
if ((inode->i_mode & 0677) != 0600) {
- pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n",
- cn.corename);
+ coredump_report_failure("Core dump to %s aborted: "
+ "cannot preserve file permissions", cn.corename);
+ retval = -EPERM;
goto close_fail;
}
- if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
+ if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) {
+ retval = -EACCES;
goto close_fail;
- if (do_truncate(idmap, cprm.file->f_path.dentry,
- 0, 0, cprm.file))
+ }
+ retval = do_truncate(idmap, cprm.file->f_path.dentry,
+ 0, 0, cprm.file);
+ if (retval)
goto close_fail;
}
@@ -757,11 +785,16 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* have this set to NULL.
*/
if (!cprm.file) {
- pr_info("Core dump to |%s disabled\n", cn.corename);
+ coredump_report_failure("Core dump to |%s disabled", cn.corename);
+ retval = -EPERM;
goto close_fail;
}
- if (!dump_vma_snapshot(&cprm))
+ if (!dump_vma_snapshot(&cprm)) {
+ coredump_report_failure("Can't get VMA snapshot for core dump |%s",
+ cn.corename);
+ retval = -EACCES;
goto close_fail;
+ }
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
@@ -777,9 +810,21 @@ void do_coredump(const kernel_siginfo_t *siginfo)
}
file_end_write(cprm.file);
free_vma_snapshot(&cprm);
+ } else {
+ coredump_report_failure("Core dump to %s%s has been interrupted",
+ ispipe ? "|" : "", cn.corename);
+ retval = -EAGAIN;
+ goto fail;
}
+ coredump_report(
+ "written to %s%s: VMAs: %d, size %zu; core: %lld bytes, pos %lld",
+ ispipe ? "|" : "", cn.corename,
+ cprm.vma_count, cprm.vma_data_size, cprm.written, cprm.pos);
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
+
+ retval = 0;
+
close_fail:
if (cprm.file)
filp_close(cprm.file, NULL);
@@ -794,7 +839,7 @@ fail_unlock:
fail_creds:
put_cred(cred);
fail:
- return;
+ return retval;
}
/*
@@ -814,8 +859,16 @@ static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
if (dump_interrupted())
return 0;
n = __kernel_write(file, addr, nr, &pos);
- if (n != nr)
+ if (n != nr) {
+ if (n < 0)
+ coredump_report_failure("failed when writing out, error %zd", n);
+ else
+ coredump_report_failure(
+ "partially written out, only %zd(of %d) bytes written",
+ n, nr);
+
return 0;
+ }
file->f_pos = pos;
cprm->written += n;
cprm->pos += n;
@@ -828,9 +881,16 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
if (file->f_mode & FMODE_LSEEK) {
- if (dump_interrupted() ||
- vfs_llseek(file, nr, SEEK_CUR) < 0)
+ int ret;
+
+ if (dump_interrupted())
+ return 0;
+
+ ret = vfs_llseek(file, nr, SEEK_CUR);
+ if (ret < 0) {
+ coredump_report_failure("failed when seeking, error %d", ret);
return 0;
+ }
cprm->pos += nr;
return 1;
} else {
@@ -983,11 +1043,10 @@ void validate_coredump_safety(void)
{
if (suid_dumpable == SUID_DUMP_ROOT &&
core_pattern[0] != '/' && core_pattern[0] != '|') {
- pr_warn(
-"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
-"Pipe handler or fully qualified core dump path required.\n"
-"Set kernel.core_pattern before fs.suid_dumpable.\n"
- );
+
+ coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
+ "pipe handler or fully qualified core dump path required. "
+ "Set kernel.core_pattern before fs.suid_dumpable.");
}
}
@@ -1191,6 +1250,18 @@ static void free_vma_snapshot(struct coredump_params *cprm)
}
}
+static int cmp_vma_size(const void *vma_meta_lhs_ptr, const void *vma_meta_rhs_ptr)
+{
+ const struct core_vma_metadata *vma_meta_lhs = vma_meta_lhs_ptr;
+ const struct core_vma_metadata *vma_meta_rhs = vma_meta_rhs_ptr;
+
+ if (vma_meta_lhs->dump_size < vma_meta_rhs->dump_size)
+ return -1;
+ if (vma_meta_lhs->dump_size > vma_meta_rhs->dump_size)
+ return 1;
+ return 0;
+}
+
/*
* Under the mmap_lock, take a snapshot of relevant information about the task's
* VMAs.
@@ -1253,5 +1324,8 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
cprm->vma_data_size += m->dump_size;
}
+ sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta),
+ cmp_vma_size, NULL);
+
return true;
}
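
The coredump_report()/coredump_report_failure() helpers used throughout the hunks above are introduced elsewhere in this series (in the coredump header); as a rough, assumed sketch of the logging shape they provide, not the exact in-tree definition:

    /* Assumed sketch only: the real definitions live in include/linux/coredump.h. */
    #define __coredump_printk(level, fmt, ...)				\
    	printk_ratelimited(level "coredump: %d(%s): " fmt "\n",	\
    			   task_tgid_vnr(current), current->comm,	\
    			   ##__VA_ARGS__)
    #define coredump_report(fmt, ...)					\
    	__coredump_printk(KERN_INFO, fmt, ##__VA_ARGS__)
    #define coredump_report_failure(fmt, ...)				\
    	__coredump_printk(KERN_WARNING, fmt, ##__VA_ARGS__)
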
diff --git a/fs/dcache.c b/fs/dcache.c
index 3d8daaecb6d1..0f6b16ba30d0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -96,11 +96,16 @@ EXPORT_SYMBOL(dotdot_name);
*
* This hash-function tries to avoid losing too many bits of hash
* information, yet avoid using a prime hash-size or similar.
+ *
+ * Marking the variables "used" ensures that the compiler doesn't
+ * optimize them away completely on architectures with runtime
+ * constant infrastructure, so that debuggers can still see their
+ * values. Updating these values has no effect on those architectures.
*/
-static unsigned int d_hash_shift __ro_after_init;
+static unsigned int d_hash_shift __ro_after_init __used;
-static struct hlist_bl_head *dentry_hashtable __ro_after_init;
+static struct hlist_bl_head *dentry_hashtable __ro_after_init __used;
static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
{
@@ -1908,8 +1913,13 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
__d_instantiate(entry, inode);
WARN_ON(!(inode->i_state & I_NEW));
inode->i_state &= ~I_NEW & ~I_CREATING;
+ /*
+ * Pairs with the barrier in prepare_to_wait_event() to make sure
+ * ___wait_var_event() either sees the bit cleared or
+ * waitqueue_active() check in wake_up_var() sees the waiter.
+ */
smp_mb();
- wake_up_bit(&inode->i_state, __I_NEW);
+ inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);
@@ -2163,9 +2173,6 @@ seqretry:
* without taking d_lock and checking d_seq sequence count against @seq
* returned here.
*
- * A refcount may be taken on the found dentry with the d_rcu_to_refcount
- * function.
- *
* Alternatively, __d_lookup_rcu may be called again to look up the child of
* the returned dentry, so long as its parent's seqlock is checked after the
* child is looked up. Thus, an interlocking stepping of sequence lock checks
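
The barrier comment added to d_instantiate_new() describes the waker side; a simplified sketch of the waiter it pairs with, using the generic wait_var_event() facility (the in-tree waiters use dedicated inode-state wait helpers, so treat this only as an illustration of the ordering requirement):

    /* Illustration: sleep until I_NEW is cleared.  prepare_to_wait_event()
     * provides the barrier that pairs with the smp_mb() before
     * inode_wake_up_bit() above. */
    static inline void wait_until_inode_ready(struct inode *inode)
    {
    	wait_var_event(&inode->i_state,
    		       !(READ_ONCE(inode->i_state) & I_NEW));
    }
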
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 91521576f500..66d9b3b4c588 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -89,12 +89,14 @@ enum {
Opt_uid,
Opt_gid,
Opt_mode,
+ Opt_source,
};
static const struct fs_parameter_spec debugfs_param_specs[] = {
fsparam_gid ("gid", Opt_gid),
fsparam_u32oct ("mode", Opt_mode),
fsparam_uid ("uid", Opt_uid),
+ fsparam_string ("source", Opt_source),
{}
};
@@ -126,6 +128,12 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_mode:
opts->mode = result.uint_32 & S_IALLUGO;
break;
+ case Opt_source:
+ if (fc->source)
+ return invalfc(fc, "Multiple sources specified");
+ fc->source = param->string;
+ param->string = NULL;
+ break;
/*
* We might like to report bad mount options here;
* but traditionally debugfs has ignored all mount options
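
With the new "source" parameter, a mount can give the debugfs instance an explicit source string (visible in mount tables). A hedged userspace illustration using the new mount API syscalls; it assumes headers that define SYS_fsopen and friends, the source value is arbitrary, and error checking is omitted:

    #include <sys/syscall.h>
    #include <linux/mount.h>
    #include <unistd.h>

    /* Returns a mount fd for a debugfs instance whose source is "mydebugfs". */
    static int debugfs_mount_fd(void)
    {
    	int fsfd = syscall(SYS_fsopen, "debugfs", 0);

    	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "mydebugfs", 0);
    	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
    	return syscall(SYS_fsmount, fsfd, 0, 0);
    }
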
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b0aafe640fa4..bbd05f1a2145 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,7 +37,6 @@
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <linux/atomic.h>
-#include <linux/prefetch.h>
#include "internal.h"
@@ -1121,11 +1120,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct blk_plug plug;
unsigned long align = offset | iov_iter_alignment(iter);
- /*
- * Avoid references to bdev if not absolutely needed to give
- * the early prefetch in the caller enough time.
- */
-
/* watch out for a 0 len io from a tricksy fs */
if (iov_iter_rw(iter) == READ && !count)
return 0;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index e2483acc4366..287e5d407f08 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -234,17 +234,17 @@ out:
/*
* Called with lower inode mutex held.
*/
-static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
+static int fill_zeros_to_end_of_page(struct folio *folio, unsigned int to)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
int end_byte_in_page;
- if ((i_size_read(inode) / PAGE_SIZE) != page->index)
+ if ((i_size_read(inode) / PAGE_SIZE) != folio->index)
goto out;
end_byte_in_page = i_size_read(inode) % PAGE_SIZE;
if (to > end_byte_in_page)
end_byte_in_page = to;
- zero_user_segment(page, end_byte_in_page, PAGE_SIZE);
+ folio_zero_segment(folio, end_byte_in_page, PAGE_SIZE);
out:
return 0;
}
@@ -255,7 +255,7 @@ out:
* @mapping: The eCryptfs object
* @pos: The file offset at which to start writing
* @len: Length of the write
- * @pagep: Pointer to return the page
+ * @foliop: Pointer to return the folio
* @fsdata: Pointer to return fs data (unused)
*
* This function must zero any hole we create
@@ -265,38 +265,39 @@ out:
static int ecryptfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
loff_t prev_page_end_size;
int rc = 0;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *foliop = folio;
prev_page_end_size = ((loff_t)index << PAGE_SHIFT);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(
- page, index, 0, PAGE_SIZE, mapping->host);
+ &folio->page, index, 0, PAGE_SIZE, mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"lower page segment; rc = [%d]\n",
__func__, rc);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
goto out;
} else
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
rc = ecryptfs_copy_up_encrypted_with_header(
- page, crypt_stat);
+ &folio->page, crypt_stat);
if (rc) {
printk(KERN_ERR "%s: Error attempting "
"to copy the encrypted content "
@@ -304,46 +305,46 @@ static int ecryptfs_write_begin(struct file *file,
"inserting the metadata from "
"the xattr into the header; rc "
"= [%d]\n", __func__, rc);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
} else {
rc = ecryptfs_read_lower_page_segment(
- page, index, 0, PAGE_SIZE,
+ &folio->page, index, 0, PAGE_SIZE,
mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error reading "
"page; rc = [%d]\n",
__func__, rc);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
} else {
if (prev_page_end_size
- >= i_size_read(page->mapping->host)) {
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ >= i_size_read(mapping->host)) {
+ folio_zero_range(folio, 0, PAGE_SIZE);
+ folio_mark_uptodate(folio);
} else if (len < PAGE_SIZE) {
- rc = ecryptfs_decrypt_page(page);
+ rc = ecryptfs_decrypt_page(&folio->page);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
"page at index [%ld]; "
"rc = [%d]\n",
- __func__, page->index, rc);
- ClearPageUptodate(page);
+ __func__, folio->index, rc);
+ folio_clear_uptodate(folio);
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
}
}
/* If creating a page or more of holes, zero them out via truncate.
* Note, this will increase i_size. */
if (index != 0) {
- if (prev_page_end_size > i_size_read(page->mapping->host)) {
+ if (prev_page_end_size > i_size_read(mapping->host)) {
rc = ecryptfs_truncate(file->f_path.dentry,
prev_page_end_size);
if (rc) {
@@ -359,12 +360,11 @@ static int ecryptfs_write_begin(struct file *file,
* of page? Zero it out. */
if ((i_size_read(mapping->host) == prev_page_end_size)
&& (pos != 0))
- zero_user(page, 0, PAGE_SIZE);
+ folio_zero_range(folio, 0, PAGE_SIZE);
out:
if (unlikely(rc)) {
- unlock_page(page);
- put_page(page);
- *pagep = NULL;
+ folio_unlock(folio);
+ folio_put(folio);
}
return rc;
}
@@ -457,13 +457,13 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
* @pos: The file position
* @len: The length of the data (unused)
* @copied: The amount of data copied
- * @page: The eCryptfs page
+ * @folio: The eCryptfs folio
* @fsdata: The fsdata (unused)
*/
static int ecryptfs_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
unsigned from = pos & (PAGE_SIZE - 1);
@@ -476,8 +476,8 @@ static int ecryptfs_write_end(struct file *file,
ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
- rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0,
- to);
+ rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
+ &folio->page, 0, to);
if (!rc) {
rc = copied;
fsstack_copy_inode_size(ecryptfs_inode,
@@ -485,21 +485,21 @@ static int ecryptfs_write_end(struct file *file,
}
goto out;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (copied < PAGE_SIZE) {
rc = 0;
goto out;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/* Fills in zeros if 'to' goes beyond inode size */
- rc = fill_zeros_to_end_of_page(page, to);
+ rc = fill_zeros_to_end_of_page(folio, to);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
"zeros in page with index = [0x%.16lx]\n", index);
goto out;
}
- rc = ecryptfs_encrypt_page(page);
+ rc = ecryptfs_encrypt_page(&folio->page);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
"index [0x%.16lx])\n", index);
@@ -518,8 +518,8 @@ static int ecryptfs_write_end(struct file *file,
else
rc = copied;
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return rc;
}
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 7dcdce660cac..6ea60661fa55 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -74,6 +74,23 @@ config EROFS_FS_SECURITY
If you are not using a security module, say N.
+config EROFS_FS_BACKED_BY_FILE
+ bool "File-backed EROFS filesystem support"
+ depends on EROFS_FS
+ default y
+ help
+ This allows EROFS to use filesystem image files directly, without
+ going through loopback block devices or similar layers. It is
+ particularly useful for container images with numerous blobs and
+ other sandboxes, where loop devices are cumbersome to manage. It
+ also avoids the error-prone lifetime management of otherwise
+ unnecessary virtual block devices.
+
+ Note that this feature, along with ongoing fanotify pre-content
+ hooks, will eventually replace "EROFS over fscache."
+
+ If you don't want to enable this feature, say N.
+
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
@@ -128,7 +145,7 @@ config EROFS_FS_ZIP_ZSTD
If unsure, say N.
config EROFS_FS_ONDEMAND
- bool "EROFS fscache-based on-demand read support"
+ bool "EROFS fscache-based on-demand read support (deprecated)"
depends on EROFS_FS
select NETFS_SUPPORT
select FSCACHE
@@ -138,6 +155,9 @@ config EROFS_FS_ONDEMAND
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
+ It is now deprecated and scheduled to be removed from the kernel
+ after fanotify pre-content hooks have landed.
+
If unsure, say N.
config EROFS_FS_PCPU_KTHREAD
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 097d672e6b14..4331d53c7109 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
+erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 1b7eba38ba1e..61debd799cf9 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -59,8 +59,12 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
- if (erofs_is_fscache_mode(sb))
- buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ if (erofs_is_fileio_mode(sbi))
+ buf->mapping = file_inode(sbi->fdev)->i_mapping;
+ else if (erofs_is_fscache_mode(sb))
+ buf->mapping = sbi->s_fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
}
@@ -75,38 +79,28 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map)
{
- erofs_blk_t nblocks, lastblk;
- u64 offset = map->m_la;
struct erofs_inode *vi = EROFS_I(inode);
struct super_block *sb = inode->i_sb;
bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+ erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking;
- nblocks = erofs_iblks(inode);
- lastblk = nblocks - tailendpacking;
-
- /* there is no hole in flatmode */
- map->m_flags = EROFS_MAP_MAPPED;
- if (offset < erofs_pos(sb, lastblk)) {
+ map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */
+ if (map->m_la < erofs_pos(sb, lastblk)) {
map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
- map->m_plen = erofs_pos(sb, lastblk) - offset;
- } else if (tailendpacking) {
+ map->m_plen = erofs_pos(sb, lastblk) - map->m_la;
+ } else {
+ DBG_BUGON(!tailendpacking);
map->m_pa = erofs_iloc(inode) + vi->inode_isize +
- vi->xattr_isize + erofs_blkoff(sb, offset);
- map->m_plen = inode->i_size - offset;
+ vi->xattr_isize + erofs_blkoff(sb, map->m_la);
+ map->m_plen = inode->i_size - map->m_la;
/* inline data should be located in the same meta block */
if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "inline data cross block boundary @ nid %llu",
- vi->nid);
+ erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
map->m_flags |= EROFS_MAP_META;
- } else {
- erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx",
- vi->nid, inode->i_size, map->m_la);
- DBG_BUGON(1);
- return -EIO;
}
return 0;
}
@@ -128,7 +122,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
- map->m_plen = 0;
+ map->m_plen = map->m_llen;
goto out;
}
@@ -189,16 +183,34 @@ out:
return err;
}
+static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
+ struct erofs_device_info *dif)
+{
+ map->m_bdev = NULL;
+ map->m_fp = NULL;
+ if (dif->file) {
+ if (S_ISBLK(file_inode(dif->file)->i_mode))
+ map->m_bdev = file_bdev(dif->file);
+ else
+ map->m_fp = dif->file;
+ }
+ map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
+}
+
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
+ erofs_off_t startoff, length;
int id;
map->m_bdev = sb->s_bdev;
map->m_daxdev = EROFS_SB(sb)->dax_dev;
map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
map->m_fscache = EROFS_SB(sb)->s_fscache;
+ map->m_fp = EROFS_SB(sb)->fdev;
if (map->m_deviceid) {
down_read(&devs->rwsem);
@@ -212,29 +224,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
up_read(&devs->rwsem);
return 0;
}
- map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ erofs_fill_from_devinfo(map, dif);
up_read(&devs->rwsem);
} else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
- erofs_off_t startoff, length;
-
if (!dif->mapped_blkaddr)
continue;
+
startoff = erofs_pos(sb, dif->mapped_blkaddr);
length = erofs_pos(sb, dif->blocks);
-
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
- map->m_bdev = dif->bdev_file ?
- file_bdev(dif->bdev_file) : NULL;
- map->m_daxdev = dif->dax_dev;
- map->m_dax_part_off = dif->dax_part_off;
- map->m_fscache = dif->fscache;
+ erofs_fill_from_devinfo(map, dif);
break;
}
}
@@ -243,6 +246,42 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return 0;
}
+/*
+ * bit 30: I/O error occurred on this folio
+ * bit 0 - 29: remaining parts to complete this folio
+ */
+#define EROFS_ONLINEFOLIO_EIO (1 << 30)
+
+void erofs_onlinefolio_init(struct folio *folio)
+{
+ union {
+ atomic_t o;
+ void *v;
+ } u = { .o = ATOMIC_INIT(1) };
+
+ folio->private = u.v; /* valid only if file-backed folio is locked */
+}
+
+void erofs_onlinefolio_split(struct folio *folio)
+{
+ atomic_inc((atomic_t *)&folio->private);
+}
+
+void erofs_onlinefolio_end(struct folio *folio, int err)
+{
+ int orig, v;
+
+ do {
+ orig = atomic_read((atomic_t *)&folio->private);
+ v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0);
+ } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
+
+ if (v & ~EROFS_ONLINEFOLIO_EIO)
+ return;
+ folio->private = 0;
+ folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO));
+}
+
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
@@ -392,7 +431,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
/* for uncompressed (aligned) files and raw access for other files */
-const struct address_space_operations erofs_raw_access_aops = {
+const struct address_space_operations erofs_aops = {
.read_folio = erofs_read_folio,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
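
The erofs_onlinefolio_*() helpers moved here pack a per-folio subrequest count plus an error flag into folio->private. A minimal sketch of the intended calling sequence, derived from how the fileio and zdata paths in this patch use them (the loop body is a placeholder, not real I/O):

    /* For a locked, not-uptodate folio: init holds one reference, each I/O
     * subrequest adds one via split, and the final end drops the last
     * reference, which calls folio_end_read() with the accumulated status. */
    static int demo_scan_folio(struct folio *folio, int nr_parts)
    {
    	int i, err = 0;

    	erofs_onlinefolio_init(folio);
    	for (i = 0; i < nr_parts; i++) {
    		erofs_onlinefolio_split(folio);
    		/* ...queue async I/O for part i; its completion calls
    		 * erofs_onlinefolio_end(folio, part_err); */
    	}
    	erofs_onlinefolio_end(folio, err);	/* drop the initial reference */
    	return err;
    }
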
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index c2253b6a5416..eb318c7ddd80 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -539,7 +539,7 @@ int __init z_erofs_init_decompressor(void)
for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) {
err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0;
if (err) {
- while (--i)
+ while (i--)
if (z_erofs_decomp[i])
z_erofs_decomp[i]->exit();
return err;
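
The one-character loop change above fixes the error unwinding: with "while (--i)", entry 0 is never torn down, and i underflows to -1 when the very first initializer fails. A standalone demonstration of the corrected pattern (not kernel code):

    /* Unwind entries 0..i-1 after entry i failed to initialize. */
    static void unwind_initialized(void (*exit_fns[])(void), int i)
    {
    	while (i--)		/* visits i-1, i-2, ..., 0, then stops */
    		if (exit_fns[i])
    			exit_fns[i]();
    }
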
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 2193a6710c8f..c3b90abdee37 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -8,19 +8,15 @@
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
void *dentry_blk, struct erofs_dirent *de,
- unsigned int nameoff, unsigned int maxsize)
+ unsigned int nameoff0, unsigned int maxsize)
{
- const struct erofs_dirent *end = dentry_blk + nameoff;
+ const struct erofs_dirent *end = dentry_blk + nameoff0;
while (de < end) {
- const char *de_name;
+ unsigned char d_type = fs_ftype_to_dtype(de->file_type);
+ unsigned int nameoff = le16_to_cpu(de->nameoff);
+ const char *de_name = (char *)dentry_blk + nameoff;
unsigned int de_namelen;
- unsigned char d_type;
-
- d_type = fs_ftype_to_dtype(de->file_type);
-
- nameoff = le16_to_cpu(de->nameoff);
- de_name = (char *)dentry_blk + nameoff;
/* the last dirent in the block? */
if (de + 1 >= end)
@@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = dir->i_sb;
unsigned long bsz = sb->s_blocksize;
- const size_t dirsize = i_size_read(dir);
- unsigned int i = erofs_blknr(sb, ctx->pos);
unsigned int ofs = erofs_blkoff(sb, ctx->pos);
int err = 0;
bool initial = true;
buf.mapping = dir->i_mapping;
- while (ctx->pos < dirsize) {
+ while (ctx->pos < dir->i_size) {
+ erofs_off_t dbstart = ctx->pos - ofs;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP);
+ de = erofs_bread(&buf, dbstart, EROFS_KMAP);
if (IS_ERR(de)) {
erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
- i, EROFS_I(dir)->nid);
+ erofs_blknr(sb, dbstart), EROFS_I(dir)->nid);
err = PTR_ERR(de);
break;
}
@@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
break;
}
- maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz);
-
+ maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz);
/* search dirents at the arbitrary position */
if (initial) {
initial = false;
-
ofs = roundup(ofs, sizeof(struct erofs_dirent));
- ctx->pos = erofs_pos(sb, i) + ofs;
- if (ofs >= nameoff)
- goto skip_this;
+ ctx->pos = dbstart + ofs;
}
err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
nameoff, maxsize);
if (err)
break;
-skip_this:
- ctx->pos = erofs_pos(sb, i) + maxsize;
- ++i;
+ ctx->pos = dbstart + maxsize;
ofs = 0;
}
erofs_put_metabuf(&buf);
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 6c0c270c42e1..c8f2ae845bd2 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -288,9 +288,12 @@ struct erofs_dirent {
#define EROFS_NAME_LEN 255
-/* maximum supported size of a physical compression cluster */
+/* maximum supported encoded size of a physical compressed cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
+/* maximum supported decoded size of a physical compressed cluster */
+#define Z_EROFS_PCLUSTER_MAX_DSIZE (12 * 1024 * 1024)
+
/* available compression algorithm types (for h_algorithmtype) */
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
new file mode 100644
index 000000000000..3af96b1e2c2a
--- /dev/null
+++ b/fs/erofs/fileio.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include "internal.h"
+#include <trace/events/erofs.h>
+
+struct erofs_fileio_rq {
+ struct bio_vec bvecs[BIO_MAX_VECS];
+ struct bio bio;
+ struct kiocb iocb;
+};
+
+struct erofs_fileio {
+ struct erofs_map_blocks map;
+ struct erofs_map_dev dev;
+ struct erofs_fileio_rq *rq;
+};
+
+static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
+{
+ struct erofs_fileio_rq *rq =
+ container_of(iocb, struct erofs_fileio_rq, iocb);
+ struct folio_iter fi;
+
+ if (ret > 0) {
+ if (ret != rq->bio.bi_iter.bi_size) {
+ bio_advance(&rq->bio, ret);
+ zero_fill_bio(&rq->bio);
+ }
+ ret = 0;
+ }
+ if (rq->bio.bi_end_io) {
+ rq->bio.bi_end_io(&rq->bio);
+ } else {
+ bio_for_each_folio_all(fi, &rq->bio) {
+ DBG_BUGON(folio_test_uptodate(fi.folio));
+ erofs_onlinefolio_end(fi.folio, ret);
+ }
+ }
+ bio_uninit(&rq->bio);
+ kfree(rq);
+}
+
+static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
+{
+ struct iov_iter iter;
+ int ret;
+
+ if (!rq)
+ return;
+ rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ rq->iocb.ki_ioprio = get_current_ioprio();
+ rq->iocb.ki_complete = erofs_fileio_ki_complete;
+ rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ?
+ IOCB_DIRECT : 0;
+ iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
+ rq->bio.bi_iter.bi_size);
+ ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ if (ret != -EIOCBQUEUED)
+ erofs_fileio_ki_complete(&rq->iocb, ret);
+}
+
+static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev)
+{
+ struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ);
+ rq->iocb.ki_filp = mdev->m_fp;
+ return rq;
+}
+
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev)
+{
+ return &erofs_fileio_rq_alloc(mdev)->bio;
+}
+
+void erofs_fileio_submit_bio(struct bio *bio)
+{
+ return erofs_fileio_rq_submit(container_of(bio, struct erofs_fileio_rq,
+ bio));
+}
+
+static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
+{
+ struct inode *inode = folio_inode(folio);
+ struct erofs_map_blocks *map = &io->map;
+ unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
+ loff_t pos = folio_pos(folio), ofs;
+ struct iov_iter iter;
+ struct bio_vec bv;
+ int err = 0;
+
+ erofs_onlinefolio_init(folio);
+ while (cur < end) {
+ if (!in_range(pos + cur, map->m_la, map->m_llen)) {
+ map->m_la = pos + cur;
+ map->m_llen = end - cur;
+ err = erofs_map_blocks(inode, map);
+ if (err)
+ break;
+ }
+
+ ofs = folio_pos(folio) + cur - map->m_la;
+ len = min_t(loff_t, map->m_llen - ofs, end - cur);
+ if (map->m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *src;
+
+ src = erofs_read_metabuf(&buf, inode->i_sb,
+ map->m_pa + ofs, EROFS_KMAP);
+ if (IS_ERR(src)) {
+ err = PTR_ERR(src);
+ break;
+ }
+ bvec_set_folio(&bv, folio, len, cur);
+ iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len);
+ if (copy_to_iter(src, len, &iter) != len) {
+ erofs_put_metabuf(&buf);
+ err = -EIO;
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ } else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, cur + len);
+ attached = 0;
+ } else {
+ if (io->rq && (map->m_pa + ofs != io->dev.m_pa ||
+ map->m_deviceid != io->dev.m_deviceid)) {
+io_retry:
+ erofs_fileio_rq_submit(io->rq);
+ io->rq = NULL;
+ }
+
+ if (!io->rq) {
+ io->dev = (struct erofs_map_dev) {
+ .m_pa = io->map.m_pa + ofs,
+ .m_deviceid = io->map.m_deviceid,
+ };
+ err = erofs_map_dev(inode->i_sb, &io->dev);
+ if (err)
+ break;
+ io->rq = erofs_fileio_rq_alloc(&io->dev);
+ io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9;
+ attached = 0;
+ }
+ if (!attached++)
+ erofs_onlinefolio_split(folio);
+ if (!bio_add_folio(&io->rq->bio, folio, len, cur))
+ goto io_retry;
+ io->dev.m_pa += len;
+ }
+ cur += len;
+ }
+ erofs_onlinefolio_end(folio, err);
+ return err;
+}
+
+static int erofs_fileio_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fileio io = {};
+ int err;
+
+ trace_erofs_read_folio(folio, true);
+ err = erofs_fileio_scan_folio(&io, folio);
+ erofs_fileio_rq_submit(io.rq);
+ return err;
+}
+
+static void erofs_fileio_readahead(struct readahead_control *rac)
+{
+ struct inode *inode = rac->mapping->host;
+ struct erofs_fileio io = {};
+ struct folio *folio;
+ int err;
+
+ trace_erofs_readpages(inode, readahead_index(rac),
+ readahead_count(rac), true);
+ while ((folio = readahead_folio(rac))) {
+ err = erofs_fileio_scan_folio(&io, folio);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
+ }
+ erofs_fileio_rq_submit(io.rq);
+}
+
+const struct address_space_operations erofs_fileio_aops = {
+ .read_folio = erofs_fileio_read_folio,
+ .readahead = erofs_fileio_readahead,
+};
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 43c09aae2afc..db29190656eb 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -5,11 +5,26 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
-
#include <trace/events/erofs.h>
-static void *erofs_read_inode(struct erofs_buf *buf,
- struct inode *inode, unsigned int *ofs)
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
+ unsigned int m_pofs)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ loff_t off;
+
+ m_pofs += vi->xattr_isize;
+ /* check if it cannot be handled with fast symlink scheme */
+ if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
+ check_add_overflow(m_pofs, inode->i_size, &off) ||
+ off > i_blocksize(inode))
+ return 0;
+
+ inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL);
+ return inode->i_link ? 0 : -ENOMEM;
+}
+
+static int erofs_read_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -20,20 +35,21 @@ static void *erofs_read_inode(struct erofs_buf *buf,
struct erofs_inode_compact *dic;
struct erofs_inode_extended *die, *copied = NULL;
union erofs_inode_i_u iu;
- unsigned int ifmt;
- int err;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int ifmt, ofs;
+ int err = 0;
blkaddr = erofs_blknr(sb, inode_loc);
- *ofs = erofs_blkoff(sb, inode_loc);
+ ofs = erofs_blkoff(sb, inode_loc);
- kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
vi->nid, PTR_ERR(kaddr));
- return kaddr;
+ return PTR_ERR(kaddr);
}
- dic = kaddr + *ofs;
+ dic = kaddr + ofs;
ifmt = le16_to_cpu(dic->i_format);
if (ifmt & ~EROFS_I_ALL) {
erofs_err(sb, "unsupported i_format %u of nid %llu",
@@ -54,11 +70,11 @@ static void *erofs_read_inode(struct erofs_buf *buf,
case EROFS_INODE_LAYOUT_EXTENDED:
vi->inode_isize = sizeof(struct erofs_inode_extended);
/* check if the extended inode crosses the block boundary */
- if (*ofs + vi->inode_isize <= sb->s_blocksize) {
- *ofs += vi->inode_isize;
+ if (ofs + vi->inode_isize <= sb->s_blocksize) {
+ ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
} else {
- const unsigned int gotten = sb->s_blocksize - *ofs;
+ const unsigned int gotten = sb->s_blocksize - ofs;
copied = kmalloc(vi->inode_isize, GFP_KERNEL);
if (!copied) {
@@ -66,16 +82,16 @@ static void *erofs_read_inode(struct erofs_buf *buf,
goto err_out;
}
memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(buf, sb, erofs_pos(sb, blkaddr + 1),
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1),
EROFS_KMAP);
if (IS_ERR(kaddr)) {
erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
vi->nid, PTR_ERR(kaddr));
kfree(copied);
- return kaddr;
+ return PTR_ERR(kaddr);
}
- *ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, kaddr, *ofs);
+ ofs = vi->inode_isize - gotten;
+ memcpy((u8 *)copied + gotten, kaddr, ofs);
die = copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
@@ -91,11 +107,10 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_size = le64_to_cpu(die->i_size);
kfree(copied);
- copied = NULL;
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
- *ofs += vi->inode_isize;
+ ofs += vi->inode_isize;
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
@@ -115,11 +130,21 @@ static void *erofs_read_inode(struct erofs_buf *buf,
goto err_out;
}
+ if (unlikely(inode->i_size < 0)) {
+ erofs_err(sb, "negative i_size @ nid %llu", vi->nid);
+ err = -EFSCORRUPTED;
+ goto err_out;
+ }
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ if (S_ISLNK(inode->i_mode)) {
+ err = erofs_fill_symlink(inode, kaddr, ofs);
+ if (err)
+ goto err_out;
+ }
break;
case S_IFCHR:
case S_IFBLK:
@@ -165,65 +190,23 @@ static void *erofs_read_inode(struct erofs_buf *buf,
inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
else
inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
- return kaddr;
-
err_out:
- DBG_BUGON(1);
- kfree(copied);
- erofs_put_metabuf(buf);
- return ERR_PTR(err);
-}
-
-static int erofs_fill_symlink(struct inode *inode, void *kaddr,
- unsigned int m_pofs)
-{
- struct erofs_inode *vi = EROFS_I(inode);
- unsigned int bsz = i_blocksize(inode);
- char *lnk;
-
- /* if it cannot be handled with fast symlink scheme */
- if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= bsz || inode->i_size < 0) {
- inode->i_op = &erofs_symlink_iops;
- return 0;
- }
-
- lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
- if (!lnk)
- return -ENOMEM;
-
- m_pofs += vi->xattr_isize;
- /* inline symlink data shouldn't cross block boundary */
- if (m_pofs + inode->i_size > bsz) {
- kfree(lnk);
- erofs_err(inode->i_sb,
- "inline data cross block boundary @ nid %llu",
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- memcpy(lnk, kaddr + m_pofs, inode->i_size);
- lnk[inode->i_size] = '\0';
-
- inode->i_link = lnk;
- inode->i_op = &erofs_fast_symlink_iops;
- return 0;
+ DBG_BUGON(err);
+ erofs_put_metabuf(&buf);
+ return err;
}
static int erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- void *kaddr;
- unsigned int ofs;
- int err = 0;
+ int err;
trace_erofs_fill_inode(inode);
/* read inode base data from disk */
- kaddr = erofs_read_inode(&buf, inode, &ofs);
- if (IS_ERR(kaddr))
- return PTR_ERR(kaddr);
+ err = erofs_read_inode(inode);
+ if (err)
+ return err;
/* setup the new inode */
switch (inode->i_mode & S_IFMT) {
@@ -240,9 +223,10 @@ static int erofs_fill_inode(struct inode *inode)
inode_nohighmem(inode);
break;
case S_IFLNK:
- err = erofs_fill_symlink(inode, kaddr, ofs);
- if (err)
- goto out_unlock;
+ if (inode->i_link)
+ inode->i_op = &erofs_fast_symlink_iops;
+ else
+ inode->i_op = &erofs_symlink_iops;
inode_nohighmem(inode);
break;
case S_IFCHR:
@@ -251,33 +235,33 @@ static int erofs_fill_inode(struct inode *inode)
case S_IFSOCK:
inode->i_op = &erofs_generic_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
- goto out_unlock;
+ return 0;
default:
- err = -EFSCORRUPTED;
- goto out_unlock;
+ return -EFSCORRUPTED;
}
+ mapping_set_large_folios(inode->i_mapping);
if (erofs_inode_is_data_compressed(vi->datalayout)) {
#ifdef CONFIG_EROFS_FS_ZIP
DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
erofs_info, inode->i_sb,
"EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
inode->i_mapping->a_ops = &z_erofs_aops;
- err = 0;
- goto out_unlock;
-#endif
+#else
err = -EOPNOTSUPP;
- goto out_unlock;
- }
- inode->i_mapping->a_ops = &erofs_raw_access_aops;
- mapping_set_large_folios(inode->i_mapping);
+#endif
+ } else {
+ inode->i_mapping->a_ops = &erofs_aops;
#ifdef CONFIG_EROFS_FS_ONDEMAND
- if (erofs_is_fscache_mode(inode->i_sb))
- inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
#endif
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb)))
+ inode->i_mapping->a_ops = &erofs_fileio_aops;
+#endif
+ }
-out_unlock:
- erofs_put_metabuf(&buf);
return err;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 736607675396..4efd578d7c62 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -49,7 +49,7 @@ typedef u32 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
- struct file *bdev_file;
+ struct file *file;
struct dax_device *dax_dev;
u64 dax_part_off;
@@ -130,6 +130,7 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
+ struct file *fdev;
struct inode *packed_inode;
struct erofs_dev_context *devs;
struct dax_device *dax_dev;
@@ -190,9 +191,15 @@ struct erofs_sb_info {
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->fdev;
+}
+
static inline bool erofs_is_fscache_mode(struct super_block *sb)
{
- return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) &&
+ !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev;
}
enum {
@@ -220,7 +227,7 @@ struct erofs_buf {
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits)
+#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits))
#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
@@ -365,6 +372,7 @@ struct erofs_map_dev {
struct erofs_fscache *m_fscache;
struct block_device *m_bdev;
struct dax_device *m_daxdev;
+ struct file *m_fp;
u64 m_dax_part_off;
erofs_off_t m_pa;
@@ -373,7 +381,8 @@ struct erofs_map_dev {
extern const struct super_operations erofs_sops;
-extern const struct address_space_operations erofs_raw_access_aops;
+extern const struct address_space_operations erofs_aops;
+extern const struct address_space_operations erofs_fileio_aops;
extern const struct address_space_operations z_erofs_aops;
extern const struct address_space_operations erofs_fscache_access_aops;
@@ -404,6 +413,9 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
+void erofs_onlinefolio_init(struct folio *folio);
+void erofs_onlinefolio_split(struct folio *folio);
+void erofs_onlinefolio_end(struct folio *folio, int err);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
@@ -477,6 +489,14 @@ static inline void z_erofs_exit_subsystem(void) {}
static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fileio_submit_bio(struct bio *bio);
+#else
+static inline struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fileio_submit_bio(struct bio *bio) {}
+#endif
+
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 32ce5b35e1df..666873f745da 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -10,6 +10,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/exportfs.h>
+#include <linux/backing-dev.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -108,22 +109,6 @@ static void erofs_free_inode(struct inode *inode)
kmem_cache_free(erofs_inode_cachep, vi);
}
-static bool check_layout_compatibility(struct super_block *sb,
- struct erofs_super_block *dsb)
-{
- const unsigned int feature = le32_to_cpu(dsb->feature_incompat);
-
- EROFS_SB(sb)->feature_incompat = feature;
-
- /* check if current kernel meets all mandatory requirements */
- if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
- erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
- feature & ~EROFS_ALL_FEATURE_INCOMPAT);
- return false;
- }
- return true;
-}
-
/* read variable-sized metadata, offset will be aligned by 4-byte */
void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp)
@@ -177,7 +162,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
struct erofs_deviceslot *dis;
- struct file *bdev_file;
+ struct file *file;
dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
if (IS_ERR(dis))
@@ -199,13 +184,17 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(fscache);
dif->fscache = fscache;
} else if (!sbi->devs->flatdev) {
- bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ,
- sb->s_type, NULL);
- if (IS_ERR(bdev_file))
- return PTR_ERR(bdev_file);
- dif->bdev_file = bdev_file;
- dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file),
- &dif->dax_part_off, NULL, NULL);
+ file = erofs_is_fileio_mode(sbi) ?
+ filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
+ bdev_file_open_by_path(dif->path,
+ BLK_OPEN_READ, sb->s_type, NULL);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ dif->file = file;
+ if (!erofs_is_fileio_mode(sbi))
+ dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
+ &dif->dax_part_off, NULL, NULL);
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -279,7 +268,7 @@ static int erofs_scan_devices(struct super_block *sb,
static int erofs_read_superblock(struct super_block *sb)
{
- struct erofs_sb_info *sbi;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
void *data;
@@ -291,9 +280,7 @@ static int erofs_read_superblock(struct super_block *sb)
return PTR_ERR(data);
}
- sbi = EROFS_SB(sb);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
-
ret = -EINVAL;
if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) {
erofs_err(sb, "cannot find valid erofs superblock");
@@ -318,8 +305,12 @@ static int erofs_read_superblock(struct super_block *sb)
}
ret = -EINVAL;
- if (!check_layout_compatibility(sb, dsb))
+ sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat);
+ if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) {
+ erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
+ sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT);
goto out;
+ }
sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) {
@@ -362,7 +353,7 @@ static int erofs_read_superblock(struct super_block *sb)
ret = erofs_scan_devices(sb, dsb);
if (erofs_is_fscache_mode(sb))
- erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+ erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
@@ -580,15 +571,16 @@ static void erofs_set_sysfs_name(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (erofs_is_fscache_mode(sb)) {
- if (sbi->domain_id)
- super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id,
- sbi->fsid);
- else
- super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
- return;
- }
- super_set_sysfs_name_id(sb);
+ if (sbi->domain_id)
+ super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id,
+ sbi->fsid);
+ else if (sbi->fsid)
+ super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
+ else if (erofs_is_fileio_mode(sbi))
+ super_set_sysfs_name_generic(sb, "%s",
+ bdi_dev_name(sb->s_bdi));
+ else
+ super_set_sysfs_name_id(sb);
}
static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
@@ -603,14 +595,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &erofs_sops;
sbi->blkszbits = PAGE_SHIFT;
- if (erofs_is_fscache_mode(sb)) {
+ if (!sb->s_bdev) {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- err = erofs_fscache_register_fs(sb);
- if (err)
- return err;
-
+ if (erofs_is_fscache_mode(sb)) {
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+ }
err = super_setup_bdi(sb);
if (err)
return err;
@@ -658,7 +651,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_POSIXACL;
else
sb->s_flags &= ~SB_POSIXACL;
- erofs_set_sysfs_name(sb);
#ifdef CONFIG_EROFS_FS_ZIP
xa_init(&sbi->managed_pslots);
@@ -696,6 +688,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ erofs_set_sysfs_name(sb);
err = erofs_register_sysfs(sb);
if (err)
return err;
@@ -707,11 +700,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
static int erofs_fc_get_tree(struct fs_context *fc)
{
struct erofs_sb_info *sbi = fc->s_fs_info;
+ int ret;
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
- return get_tree_bdev(fc, erofs_fc_fill_super);
+ ret = get_tree_bdev(fc, erofs_fc_fill_super);
+#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
+ if (ret == -ENOTBLK) {
+ if (!fc->source)
+ return invalf(fc, "No source specified");
+ sbi->fdev = filp_open(fc->source, O_RDONLY | O_LARGEFILE, 0);
+ if (IS_ERR(sbi->fdev))
+ return PTR_ERR(sbi->fdev);
+
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+ }
+#endif
+ return ret;
}
static int erofs_fc_reconfigure(struct fs_context *fc)
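
With the -ENOTBLK fallback above, an EROFS image stored in a regular file can be mounted directly, without attaching a loop device first. A hedged illustration using plain mount(2); the image and mount point paths are made up, and CONFIG_EROFS_FS_BACKED_BY_FILE is assumed to be enabled:

    #include <sys/mount.h>
    #include <stdio.h>

    int main(void)
    {
    	/* get_tree_bdev() fails with -ENOTBLK for a regular file, and the
    	 * new fallback opens the file and mounts it as a nodev superblock. */
    	if (mount("/srv/images/rootfs.erofs", "/mnt/erofs", "erofs",
    		  MS_RDONLY, NULL)) {
    		perror("mount");
    		return 1;
    	}
    	return 0;
    }
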
@@ -741,8 +747,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->bdev_file)
- fput(dif->bdev_file);
+ if (dif->file)
+ fput(dif->file);
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
@@ -805,7 +811,7 @@ static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
+ if ((IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) || sbi->fdev)
kill_anon_super(sb);
else
kill_block_super(sb);
@@ -815,6 +821,8 @@ static void erofs_kill_sb(struct super_block *sb)
erofs_fscache_unregister_fs(sb);
kfree(sbi->fsid);
kfree(sbi->domain_id);
+ if (sbi->fdev)
+ fput(sbi->fdev);
kfree(sbi);
sb->s_fs_info = NULL;
}
@@ -917,7 +925,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = EROFS_NAME_LEN;
if (uuid_is_null(&sb->s_uuid))
- buf->f_fsid = u64_to_fsid(erofs_is_fscache_mode(sb) ? 0 :
+ buf->f_fsid = u64_to_fsid(!sb->s_bdev ? 0 :
huge_encode_dev(sb->s_bdev->bd_dev));
else
buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 435e515c0792..63cffd0fd261 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -205,34 +205,16 @@ static struct kobject erofs_feat = {
int erofs_register_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- char *name;
- char *str = NULL;
int err;
- if (erofs_is_fscache_mode(sb)) {
- if (sbi->domain_id) {
- str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id,
- sbi->fsid);
- if (!str)
- return -ENOMEM;
- name = str;
- } else {
- name = sbi->fsid;
- }
- } else {
- name = sb->s_id;
- }
sbi->s_kobj.kset = &erofs_root;
init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
- kfree(str);
- if (err)
- goto put_sb_kobj;
- return 0;
-
-put_sb_kobj:
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
+ sb->s_sysfs_name);
+ if (err) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
return err;
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 424f656cd765..8936790618c6 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -122,42 +122,6 @@ static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
return fo->mapping == MNGD_MAPPING(sbi);
}
-/*
- * bit 30: I/O error occurred on this folio
- * bit 0 - 29: remaining parts to complete this folio
- */
-#define Z_EROFS_FOLIO_EIO (1 << 30)
-
-static void z_erofs_onlinefolio_init(struct folio *folio)
-{
- union {
- atomic_t o;
- void *v;
- } u = { .o = ATOMIC_INIT(1) };
-
- folio->private = u.v; /* valid only if file-backed folio is locked */
-}
-
-static void z_erofs_onlinefolio_split(struct folio *folio)
-{
- atomic_inc((atomic_t *)&folio->private);
-}
-
-static void z_erofs_onlinefolio_end(struct folio *folio, int err)
-{
- int orig, v;
-
- do {
- orig = atomic_read((atomic_t *)&folio->private);
- v = (orig - 1) | (err ? Z_EROFS_FOLIO_EIO : 0);
- } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
-
- if (v & ~Z_EROFS_FOLIO_EIO)
- return;
- folio->private = 0;
- folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO));
-}
-
#define Z_EROFS_ONSTACK_PAGES 32
/*
@@ -232,7 +196,8 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = erofs_allocpage(pagepool, GFP_KERNEL);
+ nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
+ true);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -965,7 +930,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f,
int err = 0;
tight = (bs == PAGE_SIZE);
- z_erofs_onlinefolio_init(folio);
+ erofs_onlinefolio_init(folio);
do {
if (offset + end - 1 < map->m_la ||
offset + end - 1 >= map->m_la + map->m_llen) {
@@ -1024,7 +989,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f,
if (err)
break;
- z_erofs_onlinefolio_split(folio);
+ erofs_onlinefolio_split(folio);
if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
f->pcl->multibases = true;
if (f->pcl->length < offset + end - map->m_la) {
@@ -1044,7 +1009,7 @@ static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f,
tight = (bs == PAGE_SIZE);
}
} while ((end = cur) > 0);
- z_erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err);
return err;
}
@@ -1147,7 +1112,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
list_del(p);
kfree(bvi);
}
@@ -1190,9 +1155,10 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
struct page *page = bvec->page;
- /* compressed data ought to be valid before decompressing */
- if (!page) {
- err = -EIO;
+ /* compressed data ought to be valid when decompressing */
+ if (IS_ERR(page) || !page) {
+ bvec->page = NULL; /* clear the failure reason */
+ err = page ? PTR_ERR(page) : -EIO;
continue;
}
be->compressed_pages[i] = page;
@@ -1268,8 +1234,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
.fillgaps = pcl->multibases,
- .gfp = pcl->besteffort ?
- GFP_KERNEL | __GFP_NOFAIL :
+ .gfp = pcl->besteffort ? GFP_KERNEL :
GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
@@ -1302,7 +1267,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- z_erofs_onlinefolio_end(page_folio(page), err);
+ erofs_onlinefolio_end(page_folio(page), err);
continue;
}
if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
@@ -1333,8 +1298,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
return err;
}
-static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct page **pagepool)
+static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
+ struct page **pagepool)
{
struct z_erofs_decompress_backend be = {
.sb = io->sb,
@@ -1343,6 +1308,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
};
z_erofs_next_pcluster_t owned = io->head;
+ int err = io->eio ? -EIO : 0;
while (owned != Z_EROFS_PCLUSTER_TAIL) {
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
@@ -1350,12 +1316,13 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
be.pcl = container_of(owned, struct z_erofs_pcluster, next);
owned = READ_ONCE(be.pcl->next);
- z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
+ err = z_erofs_decompress_pcluster(&be, err) ?: err;
if (z_erofs_is_inline_pcluster(be.pcl))
z_erofs_free_pcluster(be.pcl);
else
erofs_workgroup_put(&be.pcl->obj);
}
+ return err;
}
static void z_erofs_decompressqueue_work(struct work_struct *work)
@@ -1428,6 +1395,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
struct z_erofs_bvec zbv;
struct address_space *mapping;
struct folio *folio;
+ struct page *page;
int bs = i_blocksize(f->inode);
/* Except for inplace folios, the entire folio can be used for I/Os */
@@ -1450,7 +1418,6 @@ repeat:
* file-backed folios will be used instead.
*/
if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
- folio->private = 0;
tocache = true;
goto out_tocache;
}
@@ -1468,7 +1435,7 @@ repeat:
}
folio_lock(folio);
- if (folio->mapping == mc) {
+ if (likely(folio->mapping == mc)) {
/*
* The cached folio is still in managed cache but without
* a valid `->private` pcluster hint. Let's reconnect them.
@@ -1478,41 +1445,48 @@ repeat:
/* compressed_bvecs[] already takes a ref before */
folio_put(folio);
}
-
- /* no need to submit if it is already up-to-date */
- if (folio_test_uptodate(folio)) {
- folio_unlock(folio);
- bvec->bv_page = NULL;
+ if (likely(folio->private == pcl)) {
+ /* don't submit cache I/Os again if already uptodate */
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ bvec->bv_page = NULL;
+ }
+ return;
}
- return;
+ /*
+ * Already linked with another pcluster; for now this only occurs
+ * with crafted (fuzzed) images, but handle it anyway.
+ */
+ tocache = false; /* use temporary short-lived pages */
+ } else {
+ DBG_BUGON(1); /* referenced managed folios can't be truncated */
+ tocache = true;
}
-
- /*
- * It has been truncated, so it's unsafe to reuse this one. Let's
- * allocate a new page for compressed data.
- */
- DBG_BUGON(folio->mapping);
- tocache = true;
folio_unlock(folio);
folio_put(folio);
out_allocfolio:
- zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
+ page = __erofs_allocpage(&f->pagepool, gfp, true);
spin_lock(&pcl->obj.lockref.lock);
- if (pcl->compressed_bvecs[nr].page) {
- erofs_pagepool_add(&f->pagepool, zbv.page);
+ if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
+ if (page)
+ erofs_pagepool_add(&f->pagepool, page);
spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
- bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page;
- folio = page_folio(zbv.page);
- /* first mark it as a temporary shortlived folio (now 1 ref) */
- folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
+ pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
spin_unlock(&pcl->obj.lockref.lock);
+ bvec->bv_page = page;
+ if (!page)
+ return;
+ folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp))
+ filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+ /* turn into a temporary shortlived folio (1 ref) */
+ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
+ }
folio_attach_private(folio, pcl);
/* drop a refcount added by allocpage (then 2 refs in total here) */
folio_put(folio);
@@ -1647,17 +1621,16 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
cur = mdev.m_pa;
end = cur + pcl->pclustersize;
do {
- z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
- if (!bvec.bv_page)
- continue;
-
+ bvec.bv_page = NULL;
if (bio && (cur != last_pa ||
bio->bi_bdev != mdev.m_bdev)) {
-io_retry:
- if (!erofs_is_fscache_mode(sb))
- submit_bio(bio);
- else
+drain_io:
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
if (memstall) {
psi_memstall_leave(&pflags);
@@ -1666,6 +1639,15 @@ io_retry:
bio = NULL;
}
+ if (!bvec.bv_page) {
+ z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+ if (!bvec.bv_page)
+ continue;
+ if (cur + bvec.bv_len > end)
+ bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
+ }
+
if (unlikely(PageWorkingset(bvec.bv_page)) &&
!memstall) {
psi_memstall_enter(&pflags);
@@ -1673,10 +1655,13 @@ io_retry:
}
if (!bio) {
- bio = erofs_is_fscache_mode(sb) ?
- erofs_fscache_bio_alloc(&mdev) :
- bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
- REQ_OP_READ, GFP_NOIO);
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ bio = erofs_fileio_bio_alloc(&mdev);
+ else if (erofs_is_fscache_mode(sb))
+ bio = erofs_fscache_bio_alloc(&mdev);
+ else
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_endio;
bio->bi_iter.bi_sector = cur >> 9;
bio->bi_private = q[JQ_SUBMIT];
@@ -1685,13 +1670,9 @@ io_retry:
++nr_bios;
}
- if (cur + bvec.bv_len > end)
- bvec.bv_len = end - cur;
- DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
- goto io_retry;
-
+ goto drain_io;
last_pa = cur + bvec.bv_len;
bypass = false;
} while ((cur += bvec.bv_len) < end);
@@ -1703,10 +1684,12 @@ io_retry:
} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
if (bio) {
- if (!erofs_is_fscache_mode(sb))
- submit_bio(bio);
- else
+ if (erofs_is_fileio_mode(EROFS_SB(sb)))
+ erofs_fileio_submit_bio(bio);
+ else if (erofs_is_fscache_mode(sb))
erofs_fscache_submit_bio(bio);
+ else
+ submit_bio(bio);
if (memstall)
psi_memstall_leave(&pflags);
}
@@ -1722,26 +1705,28 @@ io_retry:
z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
-static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- bool force_fg, bool ra)
+static int z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
+ unsigned int ra_folios)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
+ struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
+ bool force_fg = z_erofs_is_sync_decompress(sbi, ra_folios);
+ int err;
if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
- return;
- z_erofs_submit_queue(f, io, &force_fg, ra);
+ return 0;
+ z_erofs_submit_queue(f, io, &force_fg, !!ra_folios);
/* handle bypass queue (no i/o pclusters) immediately */
- z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
-
+ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
if (!force_fg)
- return;
+ return err;
/* wait until all bios are completed */
wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
- z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
+ return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err;
}
/*
@@ -1803,7 +1788,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
struct inode *const inode = folio->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
int err;
@@ -1815,9 +1799,8 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
- /* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
-
+	/* if some pclusters are ready, submit them anyway */
+ err = z_erofs_runqueue(&f, 0) ?: err;
if (err && err != -EINTR)
erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
err, folio->index, EROFS_I(inode)->nid);
@@ -1830,7 +1813,6 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct folio *head = NULL, *folio;
unsigned int nr_folios;
@@ -1860,7 +1842,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
z_erofs_pcluster_readmore(&f, rac, false);
z_erofs_pcluster_end(&f);
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
+ (void)z_erofs_runqueue(&f, nr_folios);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
}
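The queue/runqueue conversion above leans on the GNU "a ?: b" shorthand to carry an error code through a loop without later successful iterations resetting it to 0. A minimal sketch of that pattern, not taken from the patch:

static int process_all(int (*step)(int), int n)
{
	int err = 0;

	for (int i = 0; i < n; i++)
		err = step(i) ?: err;	/* keep the newest failure; keep old err on success */
	return err;
}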
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 403af6e31d5b..e980e29873a5 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -687,32 +687,30 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int err = 0;
trace_erofs_map_blocks_enter(inode, map, flags);
-
- /* when trying to read beyond EOF, leave it unmapped */
- if (map->m_la >= inode->i_size) {
+ if (map->m_la >= inode->i_size) { /* post-EOF unmapped extent */
map->m_llen = map->m_la + 1 - inode->i_size;
map->m_la = inode->i_size;
map->m_flags = 0;
- goto out;
- }
-
- err = z_erofs_fill_inode_lazy(inode);
- if (err)
- goto out;
-
- if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
- !vi->z_tailextent_headlcn) {
- map->m_la = 0;
- map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
- EROFS_MAP_FRAGMENT;
- goto out;
+ } else {
+ err = z_erofs_fill_inode_lazy(inode);
+ if (!err) {
+ if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
+ } else {
+ err = z_erofs_do_map_blocks(inode, map, flags);
+ }
+ }
+ if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
+ unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
+ map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE))
+ err = -EOPNOTSUPP;
+ if (err)
+ map->m_llen = 0;
}
-
- err = z_erofs_do_map_blocks(inode, map, flags);
-out:
- if (err)
- map->m_llen = 0;
trace_erofs_map_blocks_exit(inode, map, flags, err);
return err;
}
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 9b53883e5caf..37afe2024840 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -111,7 +111,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages)
out:
if (i < z_erofs_gbuf_count && tmp_pages) {
for (j = 0; j < nrpages; ++j)
- if (tmp_pages[j] && tmp_pages[j] != gbuf->pages[j])
+ if (tmp_pages[j] && (j >= gbuf->nrpages ||
+ tmp_pages[j] != gbuf->pages[j]))
__free_page(tmp_pages[j]);
kfree(tmp_pages);
}
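The added "j >= gbuf->nrpages" test makes the error path free every temporary page that is not also referenced by the live buffer, including entries past the old size, while never double-freeing shared ones. A hypothetical userspace analogue of that cleanup rule:

#include <stdlib.h>

/* tmp[] holds candidate pages for a grow to new_n entries; live[] holds the
 * current old_n entries.  Entries shared with live[] must not be freed here. */
static void discard_tmp_pages(void **tmp, void **live, size_t old_n, size_t new_n)
{
	for (size_t j = 0; j < new_n; j++)
		if (tmp[j] && (j >= old_n || tmp[j] != live[j]))
			free(tmp[j]);
	free(tmp);
}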
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f53ca4f7fced..145f5349c612 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -420,7 +420,7 @@ static bool busy_loop_ep_timeout(unsigned long start_time,
static bool ep_busy_loop_on(struct eventpoll *ep)
{
- return !!ep->busy_poll_usecs || net_busy_loop_on();
+ return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
}
static bool ep_busy_loop_end(void *p, unsigned long start_time)
@@ -2200,11 +2200,6 @@ static int do_epoll_create(int flags)
error = PTR_ERR(file);
goto out_free_fd;
}
-#ifdef CONFIG_NET_RX_BUSY_POLL
- ep->busy_poll_usecs = 0;
- ep->busy_poll_budget = 0;
- ep->prefer_busy_poll = false;
-#endif
ep->file = file;
fd_install(fd, file);
return fd;
diff --git a/fs/exec.c b/fs/exec.c
index a126e3d1cacb..caae051c5a95 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -145,13 +145,11 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
goto out;
/*
- * may_open() has already checked for this, so it should be
- * impossible to trip now. But we need to be extra cautious
- * and check again at the very end too.
+ * Check do_open_execat() for an explanation.
*/
error = -EACCES;
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
- path_noexec(&file->f_path)))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
+ path_noexec(&file->f_path))
goto exit;
error = -ENOEXEC;
@@ -954,7 +952,6 @@ EXPORT_SYMBOL(transfer_args_to_stack);
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
- int err;
struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_EXEC,
@@ -971,24 +968,20 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
file = do_filp_open(fd, name, &open_exec_flags);
if (IS_ERR(file))
- goto out;
+ return file;
/*
- * may_open() has already checked for this, so it should be
- * impossible to trip now. But we need to be extra cautious
- * and check again at the very end too.
+ * In the past the regular type check was here. It moved to may_open() in
+ * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
+ * an invariant that all non-regular files error out before we get here.
*/
- err = -EACCES;
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
- path_noexec(&file->f_path)))
- goto exit;
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
+ path_noexec(&file->f_path)) {
+ fput(file);
+ return ERR_PTR(-EACCES);
+ }
-out:
return file;
-
-exit:
- fput(file);
- return ERR_PTR(err);
}
/**
@@ -1692,6 +1685,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
unsigned int mode;
vfsuid_t vfsuid;
vfsgid_t vfsgid;
+ int err;
if (!mnt_may_suid(file->f_path.mnt))
return;
@@ -1708,12 +1702,17 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
/* Be careful if suid/sgid is set */
inode_lock(inode);
- /* reload atomically mode/uid/gid now that lock held */
+	/* Atomically reload and check mode/uid/gid now that the lock is held. */
mode = inode->i_mode;
vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid = i_gid_into_vfsgid(idmap, inode);
+ err = inode_permission(idmap, inode, MAY_EXEC);
inode_unlock(inode);
+ /* Did the exec bit vanish out from under us? Give up. */
+ if (err)
+ return;
+
/* We ignore suid/sgid if there are no mappings for them in the ns */
if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
!vfsgid_has_mapping(bprm->cred->user_ns, vfsgid))
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 64c31867bc76..e19469e88000 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -535,20 +535,20 @@ static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end)
while (start < end) {
u32 zerofrom, len;
- struct page *page = NULL;
+ struct folio *folio;
zerofrom = start & (PAGE_SIZE - 1);
len = PAGE_SIZE - zerofrom;
if (start + len > end)
len = end - start;
- err = ops->write_begin(file, mapping, start, len, &page, NULL);
+ err = ops->write_begin(file, mapping, start, len, &folio, NULL);
if (err)
goto out;
- zero_user_segment(page, zerofrom, zerofrom + len);
+ folio_zero_range(folio, offset_in_folio(folio, start), len);
- err = ops->write_end(file, mapping, start, len, len, page, NULL);
+ err = ops->write_end(file, mapping, start, len, len, folio, NULL);
if (err < 0)
goto out;
start += len;
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index dd894e558c91..05f0e07b01d0 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -448,12 +448,11 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
static int exfat_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned int len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- *pagep = NULL;
- ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block);
+ ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block);
if (ret < 0)
exfat_write_failed(mapping, pos+len);
@@ -463,13 +462,13 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
static int exfat_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned int len, unsigned int copied,
- struct page *pagep, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct exfat_inode_info *ei = EXFAT_I(inode);
int err;
- err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+ err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
if (ei->i_size_aligned < i_size_read(inode)) {
exfat_fs_error(inode->i_sb,
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 087457061c6e..402fecf90a44 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL);
+ block_write_end(NULL, mapping, pos, len, len, folio, NULL);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
@@ -263,7 +263,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
- bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+ bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data);
bool has_filetype;
if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
@@ -290,7 +290,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
- file->f_version = inode_query_iversion(inode);
+ *(u64 *)file->private_data = inode_query_iversion(inode);
need_revalidate = false;
}
de = (ext2_dirent *)(kaddr+offset);
@@ -434,7 +434,7 @@ int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino)
static int ext2_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- return __block_write_begin(&folio->page, pos, len, ext2_get_block);
+ return __block_write_begin(folio, pos, len, ext2_get_block);
}
static int ext2_handle_dirsync(struct inode *dir)
@@ -703,8 +703,30 @@ not_empty:
return 0;
}
+static int ext2_dir_open(struct inode *inode, struct file *file)
+{
+ file->private_data = kzalloc(sizeof(u64), GFP_KERNEL);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int ext2_dir_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static loff_t ext2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_llseek_cookie(file, offset, whence,
+ (u64 *)file->private_data);
+}
+
const struct file_operations ext2_dir_operations = {
- .llseek = generic_file_llseek,
+ .open = ext2_dir_open,
+ .release = ext2_dir_release,
+ .llseek = ext2_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext2_readdir,
.unlocked_ioctl = ext2_ioctl,
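The ext2 conversion (and the ext4 one below) replaces file->f_version with a per-open cookie kept in file->private_data; generic_llseek_cookie() bumps it on seeks so the next readdir rescans. A condensed sketch of the staleness check, assuming the same iversion helpers used in the hunk and a cookie allocated in ->open():

static bool dir_cookie_stale(struct file *file)
{
	u64 *cookie = file->private_data;	/* allocated in ->open() */

	return !inode_eq_iversion(file_inode(file), *cookie);
}

The caller refreshes the cookie with inode_query_iversion() only after it has re-derived a valid position, exactly as ext2_readdir() does above.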
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0caa1650cee8..30f8201c155f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -916,11 +916,11 @@ static void ext2_readahead(struct readahead_control *rac)
static int
ext2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, pagep, ext2_get_block);
+ ret = block_write_begin(mapping, pos, len, foliop, ext2_get_block);
if (ret < 0)
ext2_write_failed(mapping, pos + len);
return ret;
@@ -928,11 +928,11 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
static int ext2_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
ext2_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ff4514e4626b..13196afe55ce 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -133,6 +133,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct super_block *sb = inode->i_sb;
struct buffer_head *bh = NULL;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
+ struct dir_private_info *info = file->private_data;
err = fscrypt_prepare_readdir(inode);
if (err)
@@ -229,7 +230,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the block
* to make sure. */
- if (!inode_eq_iversion(inode, file->f_version)) {
+ if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < sb->s_blocksize && i < offset; ) {
de = (struct ext4_dir_entry_2 *)
(bh->b_data + i);
@@ -249,7 +250,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset = i;
ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
| offset;
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < inode->i_size
@@ -384,6 +385,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
+ struct dir_private_info *info = file->private_data;
int dx_dir = is_dx_dir(inode);
loff_t ret, htree_max = ext4_get_htree_eof(file);
@@ -392,7 +394,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
htree_max, htree_max);
else
ret = ext4_llseek(file, offset, whence);
- file->f_version = inode_peek_iversion(inode) - 1;
+ info->cookie = inode_peek_iversion(inode) - 1;
return ret;
}
@@ -429,18 +431,15 @@ static void free_rb_tree_fname(struct rb_root *root)
*root = RB_ROOT;
}
-
-static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
- loff_t pos)
+static void ext4_htree_init_dir_info(struct file *filp, loff_t pos)
{
- struct dir_private_info *p;
-
- p = kzalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- return NULL;
- p->curr_hash = pos2maj_hash(filp, pos);
- p->curr_minor_hash = pos2min_hash(filp, pos);
- return p;
+ struct dir_private_info *p = filp->private_data;
+
+ if (is_dx_dir(file_inode(filp)) && !p->initialized) {
+ p->curr_hash = pos2maj_hash(filp, pos);
+ p->curr_minor_hash = pos2min_hash(filp, pos);
+ p->initialized = true;
+ }
}
void ext4_htree_free_dir_info(struct dir_private_info *p)
@@ -552,12 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
struct fname *fname;
int ret = 0;
- if (!info) {
- info = ext4_htree_create_dir_info(file, ctx->pos);
- if (!info)
- return -ENOMEM;
- file->private_data = info;
- }
+ ext4_htree_init_dir_info(file, ctx->pos);
if (ctx->pos == ext4_get_htree_eof(file))
return 0; /* EOF */
@@ -590,10 +584,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
* cached entries.
*/
if ((!info->curr_node) ||
- !inode_eq_iversion(inode, file->f_version)) {
+ !inode_eq_iversion(inode, info->cookie)) {
info->curr_node = NULL;
free_rb_tree_fname(&info->root);
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
ret = ext4_htree_fill_tree(file, info->curr_hash,
info->curr_minor_hash,
&info->next_hash);
@@ -664,7 +658,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
return 0;
}
+static int ext4_dir_open(struct inode *inode, struct file *file)
+{
+ struct dir_private_info *info;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+ file->private_data = info;
+ return 0;
+}
+
const struct file_operations ext4_dir_operations = {
+ .open = ext4_dir_open,
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext4_readdir,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 08acd152261e..ecc15e5f1eba 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2553,6 +2553,8 @@ struct dir_private_info {
__u32 curr_hash;
__u32 curr_minor_hash;
__u32 next_hash;
+ u64 cookie;
+ bool initialized;
};
/* calculate the first block number of the group */
@@ -3563,13 +3565,13 @@ int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep);
+ struct folio **foliop);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep,
+ struct folio **foliop,
void **fsdata);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index e7a09a99837b..edf4aa99a974 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -601,10 +601,10 @@ retry:
goto out;
if (ext4_should_dioread_nolock(inode)) {
- ret = __block_write_begin(&folio->page, from, to,
+ ret = __block_write_begin(folio, from, to,
ext4_get_block_unwritten);
} else
- ret = __block_write_begin(&folio->page, from, to, ext4_get_block);
+ ret = __block_write_begin(folio, from, to, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -660,7 +660,7 @@ out_nofolio:
int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep)
+ struct folio **foliop)
{
int ret;
handle_t *handle;
@@ -708,7 +708,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
goto out;
}
- *pagep = &folio->page;
+ *foliop = folio;
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
ret = 0;
@@ -856,7 +856,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
goto out;
}
- ret = __block_write_begin(&folio->page, 0, inline_size,
+ ret = __block_write_begin(folio, 0, inline_size,
ext4_da_get_block_prep);
if (ret) {
up_read(&EXT4_I(inode)->xattr_sem);
@@ -891,7 +891,7 @@ out:
int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
- struct page **pagep,
+ struct folio **foliop,
void **fsdata)
{
int ret;
@@ -954,7 +954,7 @@ retry_journal:
goto out_release_page;
up_read(&EXT4_I(inode)->xattr_sem);
- *pagep = &folio->page;
+ *foliop = folio;
brelse(iloc.bh);
return 1;
out_release_page:
@@ -1460,6 +1460,7 @@ int ext4_read_inline_dir(struct file *file,
struct ext4_iloc iloc;
void *dir_buf = NULL;
int dotdot_offset, dotdot_size, extra_offset, extra_size;
+ struct dir_private_info *info = file->private_data;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1503,12 +1504,12 @@ int ext4_read_inline_dir(struct file *file,
extra_size = extra_offset + inline_size;
/*
- * If the version has changed since the last call to
+ * If the cookie has changed since the last call to
* readdir(2), then we might be pointing to an invalid
* dirent right now. Scan from the start of the inline
* dir to make sure.
*/
- if (!inode_eq_iversion(inode, file->f_version)) {
+ if (!inode_eq_iversion(inode, info->cookie)) {
for (i = 0; i < extra_size && i < offset;) {
/*
* "." is with offset 0 and
@@ -1540,7 +1541,7 @@ int ext4_read_inline_dir(struct file *file,
}
offset = i;
ctx->pos = offset;
- file->f_version = inode_query_iversion(inode);
+ info->cookie = inode_query_iversion(inode);
}
while (ctx->pos < extra_size) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 941c1c0d5c6e..03374dc215d1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1145,7 +1145,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
*/
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
int ret, needed_blocks;
@@ -1170,7 +1170,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
- pagep);
+ foliop);
if (ret < 0)
return ret;
if (ret == 1)
@@ -1224,10 +1224,10 @@ retry_journal:
ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
#else
if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(&folio->page, pos, len,
+ ret = __block_write_begin(folio, pos, len,
ext4_get_block_unwritten);
else
- ret = __block_write_begin(&folio->page, pos, len, ext4_get_block);
+ ret = __block_write_begin(folio, pos, len, ext4_get_block);
#endif
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -1270,7 +1270,7 @@ retry_journal:
folio_put(folio);
return ret;
}
- *pagep = &folio->page;
+ *foliop = folio;
return ret;
}
@@ -1298,9 +1298,8 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
static int ext4_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
@@ -1315,7 +1314,7 @@ static int ext4_write_end(struct file *file,
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
/*
* it's important to update i_size while still holding folio lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1402,9 +1401,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
static int ext4_journalled_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
@@ -2926,7 +2924,7 @@ static int ext4_nonda_switch(struct super_block *sb)
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret, retries = 0;
struct folio *folio;
@@ -2941,14 +2939,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
- len, pagep, fsdata);
+ len, foliop, fsdata);
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
- pagep, fsdata);
+ foliop, fsdata);
if (ret < 0)
return ret;
if (ret == 1)
@@ -2964,7 +2962,7 @@ retry:
#ifdef CONFIG_FS_ENCRYPTION
ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
#else
- ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);
+ ret = __block_write_begin(folio, pos, len, ext4_da_get_block_prep);
#endif
if (ret < 0) {
folio_unlock(folio);
@@ -2983,7 +2981,7 @@ retry:
return ret;
}
- *pagep = &folio->page;
+ *foliop = folio;
return ret;
}
@@ -3029,7 +3027,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
	 * flag, which is all that's needed to trigger page writeback.
*/
copied = block_write_end(NULL, mapping, pos, len, copied,
- &folio->page, NULL);
+ folio, NULL);
new_i_size = pos + copied;
/*
@@ -3080,15 +3078,14 @@ static int ext4_da_do_write_end(struct address_space *mapping,
static int ext4_da_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int write_mode = (int)(unsigned long)fsdata;
- struct folio *folio = page_folio(page);
if (write_mode == FALL_BACK_TO_NONDELALLOC)
return ext4_write_end(file, mapping, pos,
- len, copied, &folio->page, fsdata);
+ len, copied, folio, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
@@ -6219,7 +6216,7 @@ retry_alloc:
if (folio_pos(folio) + len > size)
len = size - folio_pos(folio);
- err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
+ err = __block_write_begin(folio, 0, len, ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
if (ext4_journal_folio_buffers(handle, folio, len))
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 2f37e1ea3955..d9203228ce97 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -76,17 +76,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
while (count) {
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int res;
- res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata);
if (res)
return res;
- memcpy_to_page(page, offset_in_page(pos), buf, n);
+ memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n);
- res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata);
if (res < 0)
return res;
if (res != n)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 6457e5bca9c9..5dfa0207ad8f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3552,12 +3552,12 @@ reserve_block:
}
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *page = NULL;
- pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
+ struct folio *folio;
+ pgoff_t index = pos >> PAGE_SHIFT;
bool need_balance = false;
bool use_cow = false;
block_t blkaddr = NULL_ADDR;
@@ -3573,7 +3573,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
- * lock_page(page #0) -> lock_page(inode_page)
+ * folio_lock(folio #0) -> folio_lock(inode_page)
*/
if (index != 0) {
err = f2fs_convert_inline_inode(inode);
@@ -3584,18 +3584,20 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
int ret;
+ struct page *page;
*fsdata = NULL;
if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode)))
goto repeat;
- ret = f2fs_prepare_compress_overwrite(inode, pagep,
+ ret = f2fs_prepare_compress_overwrite(inode, &page,
index, fsdata);
if (ret < 0) {
err = ret;
goto fail;
} else if (ret) {
+ *foliop = page_folio(page);
return 0;
}
}
@@ -3603,81 +3605,85 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
repeat:
/*
- * Do not use grab_cache_page_write_begin() to avoid deadlock due to
- * wait_for_stable_page. Will wait that below with our IO control.
+	 * Do not use FGP_STABLE, to avoid a deadlock;
+	 * we will wait for writeback below under our own IO control.
*/
- page = f2fs_pagecache_get_page(mapping, index,
+ folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS);
- if (!page) {
- err = -ENOMEM;
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
goto fail;
}
/* TODO: cluster can be compressed due to race with .writepage */
- *pagep = page;
+ *foliop = folio;
if (f2fs_is_atomic_file(inode))
- err = prepare_atomic_write_begin(sbi, page, pos, len,
+ err = prepare_atomic_write_begin(sbi, &folio->page, pos, len,
&blkaddr, &need_balance, &use_cow);
else
- err = prepare_write_begin(sbi, page, pos, len,
+ err = prepare_write_begin(sbi, &folio->page, pos, len,
&blkaddr, &need_balance);
if (err)
- goto fail;
+ goto put_folio;
if (need_balance && !IS_NOQUOTA(inode) &&
has_not_enough_free_secs(sbi, 0, 0)) {
- unlock_page(page);
+ folio_unlock(folio);
f2fs_balance_fs(sbi, true);
- lock_page(page);
- if (page->mapping != mapping) {
- /* The page got truncated from under us */
- f2fs_put_page(page, 1);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ /* The folio got truncated from under us */
+ folio_unlock(folio);
+ folio_put(folio);
goto repeat;
}
}
- f2fs_wait_on_page_writeback(page, DATA, false, true);
+ f2fs_wait_on_page_writeback(&folio->page, DATA, false, true);
- if (len == PAGE_SIZE || PageUptodate(page))
+ if (len == folio_size(folio) || folio_test_uptodate(folio))
return 0;
if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) &&
!f2fs_verity_in_progress(inode)) {
- zero_user_segment(page, len, PAGE_SIZE);
+ folio_zero_segment(folio, len, PAGE_SIZE);
return 0;
}
if (blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
} else {
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
err = -EFSCORRUPTED;
- goto fail;
+ goto put_folio;
}
err = f2fs_submit_page_read(use_cow ?
- F2FS_I(inode)->cow_inode : inode, page,
+ F2FS_I(inode)->cow_inode : inode, &folio->page,
blkaddr, 0, true);
if (err)
- goto fail;
+ goto put_folio;
- lock_page(page);
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
+ folio_lock(folio);
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto repeat;
}
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
err = -EIO;
- goto fail;
+ goto put_folio;
}
}
return 0;
+put_folio:
+ folio_unlock(folio);
+ folio_put(folio);
fail:
- f2fs_put_page(page, 1);
f2fs_write_failed(inode, pos + len);
return err;
}
@@ -3685,9 +3691,9 @@ fail:
static int f2fs_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
trace_f2fs_write_end(inode, pos, len, copied);
@@ -3696,17 +3702,17 @@ static int f2fs_write_end(struct file *file,
* should be PAGE_SIZE. Otherwise, we treat it with zero copied and
* let generic_perform_write() try to copy data again through copied=0.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (unlikely(copied != len))
copied = 0;
else
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
/* overwrite compressed file */
if (f2fs_compressed_file(inode) && fsdata) {
- f2fs_compress_write_end(inode, fsdata, page->index, copied);
+ f2fs_compress_write_end(inode, fsdata, folio->index, copied);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
if (pos + copied > i_size_read(inode) &&
@@ -3719,7 +3725,7 @@ static int f2fs_write_end(struct file *file,
if (!copied)
goto unlock_out;
- set_page_dirty(page);
+ folio_mark_dirty(folio);
if (pos + copied > i_size_read(inode) &&
!f2fs_verity_in_progress(inode)) {
@@ -3729,7 +3735,8 @@ static int f2fs_write_end(struct file *file,
pos + copied);
}
unlock_out:
- f2fs_put_page(page, 1);
+ folio_unlock(folio);
+ folio_put(folio);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3959fd137cc9..176b5177c89d 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2677,7 +2677,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
const struct address_space_operations *a_ops = mapping->a_ops;
int offset = off & (sb->s_blocksize - 1);
size_t towrite = len;
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int err = 0;
int tocopy;
@@ -2687,7 +2687,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
towrite);
retry:
err = a_ops->write_begin(NULL, mapping, off, tocopy,
- &page, &fsdata);
+ &folio, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
@@ -2697,10 +2697,10 @@ retry:
break;
}
- memcpy_to_page(page, offset, data, tocopy);
+ memcpy_to_folio(folio, offset_in_folio(folio, off), data, tocopy);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
- page, fsdata);
+ folio, fsdata);
offset = 0;
towrite -= tocopy;
off += tocopy;
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index f7bb0c54502c..84a33fe49bed 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -80,17 +80,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
while (count) {
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int res;
- res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
+ res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata);
if (res)
return res;
- memcpy_to_page(page, offset_in_page(pos), buf, n);
+ memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n);
- res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
+ res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata);
if (res < 0)
return res;
if (res != n)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 19115fd2d2a4..75722bbd6b5f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -221,13 +221,12 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
static int fat_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int err;
- *pagep = NULL;
err = cont_write_begin(file, mapping, pos, len,
- pagep, fsdata, fat_get_block,
+ foliop, fsdata, fat_get_block,
&MSDOS_I(mapping->host)->mmu_private);
if (err < 0)
fat_write_failed(mapping, pos + len);
@@ -236,11 +235,11 @@ static int fat_write_begin(struct file *file, struct address_space *mapping,
static int fat_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *pagep, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int err;
- err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+ err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
if (err < len)
fat_write_failed(mapping, pos + len);
if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 300e5d9ad913..081e5e3d89ea 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -33,6 +33,8 @@
#include <asm/siginfo.h>
#include <linux/uaccess.h>
+#include "internal.h"
+
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
static int setfl(int fd, struct file * filp, unsigned int arg)
@@ -87,29 +89,65 @@ static int setfl(int fd, struct file * filp, unsigned int arg)
return error;
}
-static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
- int force)
+/*
+ * Allocate a file->f_owner struct if it doesn't exist, handling racing
+ * allocations correctly.
+ */
+int file_f_owner_allocate(struct file *file)
{
- write_lock_irq(&filp->f_owner.lock);
- if (force || !filp->f_owner.pid) {
- put_pid(filp->f_owner.pid);
- filp->f_owner.pid = get_pid(pid);
- filp->f_owner.pid_type = type;
+ struct fown_struct *f_owner;
- if (pid) {
- const struct cred *cred = current_cred();
- filp->f_owner.uid = cred->uid;
- filp->f_owner.euid = cred->euid;
- }
+ f_owner = file_f_owner(file);
+ if (f_owner)
+ return 0;
+
+ f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
+ if (!f_owner)
+ return -ENOMEM;
+
+ rwlock_init(&f_owner->lock);
+ f_owner->file = file;
+ /* If someone else raced us, drop our allocation. */
+ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
+ kfree(f_owner);
+ return 0;
+}
+EXPORT_SYMBOL(file_f_owner_allocate);
+
+void file_f_owner_release(struct file *file)
+{
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(file);
+ if (f_owner) {
+ put_pid(f_owner->pid);
+ kfree(f_owner);
}
- write_unlock_irq(&filp->f_owner.lock);
}
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
- security_file_set_fowner(filp);
- f_modown(filp, pid, type, force);
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(filp);
+ if (WARN_ON_ONCE(!f_owner))
+ return;
+
+ write_lock_irq(&f_owner->lock);
+ if (force || !f_owner->pid) {
+ put_pid(f_owner->pid);
+ f_owner->pid = get_pid(pid);
+ f_owner->pid_type = type;
+
+ if (pid) {
+ const struct cred *cred = current_cred();
+ security_file_set_fowner(filp);
+ f_owner->uid = cred->uid;
+ f_owner->euid = cred->euid;
+ }
+ }
+ write_unlock_irq(&f_owner->lock);
}
EXPORT_SYMBOL(__f_setown);
@@ -119,6 +157,8 @@ int f_setown(struct file *filp, int who, int force)
struct pid *pid = NULL;
int ret = 0;
+ might_sleep();
+
type = PIDTYPE_TGID;
if (who < 0) {
/* avoid overflow below */
@@ -129,6 +169,10 @@ int f_setown(struct file *filp, int who, int force)
who = -who;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
if (who) {
pid = find_vpid(who);
@@ -146,22 +190,27 @@ EXPORT_SYMBOL(f_setown);
void f_delown(struct file *filp)
{
- f_modown(filp, NULL, PIDTYPE_TGID, 1);
+ __f_setown(filp, NULL, PIDTYPE_TGID, 1);
}
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
+ struct fown_struct *f_owner;
- read_lock_irq(&filp->f_owner.lock);
+ f_owner = file_f_owner(filp);
+ if (!f_owner)
+ return pid;
+
+ read_lock_irq(&f_owner->lock);
rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
- pid = pid_vnr(filp->f_owner.pid);
- if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ if (pid_task(f_owner->pid, f_owner->pid_type)) {
+ pid = pid_vnr(f_owner->pid);
+ if (f_owner->pid_type == PIDTYPE_PGID)
pid = -pid;
}
rcu_read_unlock();
- read_unlock_irq(&filp->f_owner.lock);
+ read_unlock_irq(&f_owner->lock);
return pid;
}
@@ -194,6 +243,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
return -EINVAL;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
pid = find_vpid(owner.pid);
if (owner.pid && !pid)
@@ -210,13 +263,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
struct f_owner_ex __user *owner_p = (void __user *)arg;
struct f_owner_ex owner = {};
int ret = 0;
+ struct fown_struct *f_owner;
+ enum pid_type pid_type = PIDTYPE_PID;
- read_lock_irq(&filp->f_owner.lock);
- rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
- owner.pid = pid_vnr(filp->f_owner.pid);
- rcu_read_unlock();
- switch (filp->f_owner.pid_type) {
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ rcu_read_lock();
+ if (pid_task(f_owner->pid, f_owner->pid_type))
+ owner.pid = pid_vnr(f_owner->pid);
+ rcu_read_unlock();
+ pid_type = f_owner->pid_type;
+ }
+
+ switch (pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
break;
@@ -234,7 +294,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
ret = -EINVAL;
break;
}
- read_unlock_irq(&filp->f_owner.lock);
+ if (f_owner)
+ read_unlock_irq(&f_owner->lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
@@ -248,14 +309,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
struct user_namespace *user_ns = current_user_ns();
+ struct fown_struct *f_owner;
uid_t __user *dst = (void __user *)arg;
- uid_t src[2];
+ uid_t src[2] = {0, 0};
int err;
- read_lock_irq(&filp->f_owner.lock);
- src[0] = from_kuid(user_ns, filp->f_owner.uid);
- src[1] = from_kuid(user_ns, filp->f_owner.euid);
- read_unlock_irq(&filp->f_owner.lock);
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ src[0] = from_kuid(user_ns, f_owner->uid);
+ src[1] = from_kuid(user_ns, f_owner->euid);
+ read_unlock_irq(&f_owner->lock);
+ }
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
@@ -343,6 +408,36 @@ static long f_dupfd_query(int fd, struct file *filp)
return f.file == filp;
}
+/* Let the caller figure out whether a given file was just created. */
+static long f_created_query(const struct file *filp)
+{
+ return !!(filp->f_mode & FMODE_CREATED);
+}
+
+static int f_owner_sig(struct file *filp, int signum, bool setsig)
+{
+ int ret = 0;
+ struct fown_struct *f_owner;
+
+ might_sleep();
+
+ if (setsig) {
+ if (!valid_signal(signum))
+ return -EINVAL;
+
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+ }
+
+ f_owner = file_f_owner(filp);
+ if (setsig)
+ f_owner->signum = signum;
+ else if (f_owner)
+ ret = f_owner->signum;
+ return ret;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -352,6 +447,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
long err = -EINVAL;
switch (cmd) {
+ case F_CREATED_QUERY:
+ err = f_created_query(filp);
+ break;
case F_DUPFD:
err = f_dupfd(argi, filp, 0);
break;
@@ -421,15 +519,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = f_getowner_uids(filp, arg);
break;
case F_GETSIG:
- err = filp->f_owner.signum;
+ err = f_owner_sig(filp, 0, false);
break;
case F_SETSIG:
- /* arg == 0 restores default behaviour. */
- if (!valid_signal(argi)) {
- break;
- }
- err = 0;
- filp->f_owner.signum = argi;
+ err = f_owner_sig(filp, argi, true);
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
@@ -463,6 +556,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
static int check_fcntl_cmd(unsigned cmd)
{
switch (cmd) {
+ case F_CREATED_QUERY:
case F_DUPFD:
case F_DUPFD_CLOEXEC:
case F_DUPFD_QUERY:
@@ -844,14 +938,19 @@ static void send_sigurg_to_task(struct task_struct *p,
do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}
-int send_sigurg(struct fown_struct *fown)
+int send_sigurg(struct file *file)
{
+ struct fown_struct *fown;
struct task_struct *p;
enum pid_type type;
struct pid *pid;
unsigned long flags;
int ret = 0;
+ fown = file_f_owner(file);
+ if (!fown)
+ return 0;
+
read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
@@ -1027,13 +1126,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
}
read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
- fown = &fa->fa_file->f_owner;
+ fown = file_f_owner(fa->fa_file);
+ if (!fown)
+ goto next;
/* Don't send SIGURG to processes which have not set a
queued signum: SIGURG has its own default signalling
mechanism. */
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
+next:
read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
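file_f_owner_allocate() resolves allocation races with a bare cmpxchg on file->f_owner: whichever caller publishes its allocation first wins and every loser frees its own copy. A userspace C11 model of the same idea (names below are illustrative, not kernel APIs):

#include <stdatomic.h>
#include <stdlib.h>

struct fown { int signum; };

static _Atomic(struct fown *) f_owner;		/* plays the role of file->f_owner */

static int fown_allocate(void)
{
	struct fown *expected = NULL, *new;

	if (atomic_load(&f_owner))
		return 0;			/* already set up */
	new = calloc(1, sizeof(*new));
	if (!new)
		return -1;			/* -ENOMEM in the kernel */
	/* If someone else raced us, drop our allocation. */
	if (!atomic_compare_exchange_strong(&f_owner, &expected, new))
		free(new);
	return 0;
}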
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6e8cea16790e..8cb665629f4a 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -16,7 +16,8 @@
static long do_sys_name_to_handle(const struct path *path,
struct file_handle __user *ufh,
- int __user *mnt_id, int fh_flags)
+ void __user *mnt_id, bool unique_mntid,
+ int fh_flags)
{
long retval;
struct file_handle f_handle;
@@ -69,9 +70,19 @@ static long do_sys_name_to_handle(const struct path *path,
} else
retval = 0;
/* copy the mount id */
- if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
- copy_to_user(ufh, handle,
- struct_size(handle, f_handle, handle_bytes)))
+ if (unique_mntid) {
+ if (put_user(real_mount(path->mnt)->mnt_id_unique,
+ (u64 __user *) mnt_id))
+ retval = -EFAULT;
+ } else {
+ if (put_user(real_mount(path->mnt)->mnt_id,
+ (int __user *) mnt_id))
+ retval = -EFAULT;
+ }
+ /* copy the handle */
+ if (retval != -EFAULT &&
+ copy_to_user(ufh, handle,
+ struct_size(handle, f_handle, handle_bytes)))
retval = -EFAULT;
kfree(handle);
return retval;
@@ -83,6 +94,7 @@ static long do_sys_name_to_handle(const struct path *path,
* @name: name that should be converted to handle.
* @handle: resulting file handle
* @mnt_id: mount id of the file system containing the file
+ * (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int)
* @flag: flag value to indicate whether to follow symlink or not
* and whether a decodable file handle is required.
*
@@ -92,7 +104,7 @@ static long do_sys_name_to_handle(const struct path *path,
* value required.
*/
SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
- struct file_handle __user *, handle, int __user *, mnt_id,
+ struct file_handle __user *, handle, void __user *, mnt_id,
int, flag)
{
struct path path;
@@ -100,7 +112,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
int fh_flags;
int err;
- if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
+ if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
+ AT_HANDLE_MNT_ID_UNIQUE))
return -EINVAL;
lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
@@ -109,7 +122,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
lookup_flags |= LOOKUP_EMPTY;
err = user_path_at(dfd, name, lookup_flags, &path);
if (!err) {
- err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
+ err = do_sys_name_to_handle(&path, handle, mnt_id,
+ flag & AT_HANDLE_MNT_ID_UNIQUE,
+ fh_flags);
path_put(&path);
}
return err;
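With AT_HANDLE_MNT_ID_UNIQUE set, the mnt_id argument is treated as a u64 and receives the unique mount ID instead of the legacy int one. A userspace usage sketch; the fallback flag value below is an assumption, check the uapi header on the target kernel:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef AT_HANDLE_MNT_ID_UNIQUE
#define AT_HANDLE_MNT_ID_UNIQUE 0x001	/* assumed value, verify locally */
#endif

int main(int argc, char **argv)
{
	struct file_handle *fh;
	uint64_t mnt_id_unique = 0;	/* kernel writes 8 bytes here when the flag is set */

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	if (name_to_handle_at(AT_FDCWD, argc > 1 ? argv[1] : ".", fh,
			      (int *)&mnt_id_unique, AT_HANDLE_MNT_ID_UNIQUE) == -1) {
		perror("name_to_handle_at");
		return 1;
	}
	printf("unique mount id: %llu, handle bytes: %u\n",
	       (unsigned long long)mnt_id_unique, fh->handle_bytes);
	free(fh);
	return 0;
}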
diff --git a/fs/file.c b/fs/file.c
index a11e59b5d602..976ecd4ce2c6 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -46,27 +46,23 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
+#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
 * Copy 'copy_words' words of fd bits from the old table to the new table and clear the extra
* space if any. This does not copy the file pointers. Called with the files
* spinlock held for write.
*/
-static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
- unsigned int count)
+static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+ unsigned int copy_words)
{
- unsigned int cpy, set;
-
- cpy = count / BITS_PER_BYTE;
- set = (nfdt->max_fds - count) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)nfdt->open_fds + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)nfdt->close_on_exec + cpy, 0, set);
-
- cpy = BITBIT_SIZE(count);
- set = BITBIT_SIZE(nfdt->max_fds) - cpy;
- memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
- memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+ unsigned int nwords = fdt_words(nfdt);
+
+ bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
+ copy_words, nwords);
}
/*
@@ -84,7 +80,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
memcpy(nfdt->fd, ofdt->fd, cpy);
memset((char *)nfdt->fd + cpy, 0, set);
- copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+ copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}
/*
@@ -379,7 +375,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
open_files = sane_fdtable_size(old_fdt, max_fds);
}
- copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+ copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
@@ -676,7 +672,7 @@ int close_fd(unsigned fd)
return filp_close(file, files);
}
-EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd);
/**
* last_fd - return last valid index into fd table
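copy_fd_bitmaps() now works in whole-word units: fdt_words() converts max_fds to longs, and bitmap_copy_and_extend() copies the first chunk and zero-fills the rest of the destination. A plain-C model of what that helper is expected to do for this caller (a sketch, not the kernel implementation):

#include <limits.h>
#include <stddef.h>
#include <string.h>

#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

/* Copy the first count_bits from src, clear dst up to size_bits.  Both counts
 * are whole multiples of the word size here, as enforced by fdt_words(). */
static void bitmap_copy_and_extend_model(unsigned long *dst,
					 const unsigned long *src,
					 size_t count_bits, size_t size_bits)
{
	size_t copy_words = count_bits / WORD_BITS;
	size_t total_words = size_bits / WORD_BITS;

	memcpy(dst, src, copy_words * sizeof(*dst));
	memset(dst + copy_words, 0, (total_words - copy_words) * sizeof(*dst));
}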
diff --git a/fs/file_table.c b/fs/file_table.c
index ca7843dde56d..eed5ffad9997 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -136,6 +136,7 @@ static int __init init_fs_stat_sysctls(void)
register_sysctl_init("fs", fs_stat_sysctls);
if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
struct ctl_table_header *hdr;
+
hdr = register_sysctl_mount_point("fs/binfmt_misc");
kmemleak_not_leak(hdr);
}
@@ -155,8 +156,14 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
return error;
}
- rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
+ /*
+ * Note that f_pos_lock is only used for files raising
+ * FMODE_ATOMIC_POS and directories. Other files such as pipes
+	 * don't need it and, since f_pos_lock is in a union, may reuse
+ * the space for other purposes. They are expected to initialize
+ * the respective member when opening the file.
+ */
mutex_init(&f->f_pos_lock);
f->f_flags = flags;
f->f_mode = OPEN_FMODE(flags);
@@ -383,7 +390,9 @@ EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
struct file *alloc_file_clone(struct file *base, int flags,
const struct file_operations *fops)
{
- struct file *f = alloc_file(&base->f_path, flags, fops);
+ struct file *f;
+
+ f = alloc_file(&base->f_path, flags, fops);
if (!IS_ERR(f)) {
path_get(&f->f_path);
f->f_mapping = base->f_mapping;
@@ -425,7 +434,7 @@ static void __fput(struct file *file)
cdev_put(inode->i_cdev);
}
fops_put(file->f_op);
- put_pid(file->f_owner.pid);
+ file_f_owner_release(file);
put_file_access(file);
dput(dentry);
if (unlikely(mode & FMODE_NEED_UNMOUNT))
@@ -512,9 +521,14 @@ EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
- filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct file, f_freeptr),
+ };
+
+ filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b865a3fa52f3..d8bec3c1bb1f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1132,6 +1132,7 @@ out_bdi_put:
/**
* cgroup_writeback_umount - flush inode wb switches for umount
+ * @sb: target super_block
*
* This function is called when a super_block is about to be destroyed and
* flushes in-flight inode wb switches. An inode wb switch goes through
@@ -1140,8 +1141,12 @@ out_bdi_put:
* rare occurrences and synchronize_rcu() can take a while, perform
* flushing iff wb switches are in flight.
*/
-void cgroup_writeback_umount(void)
+void cgroup_writeback_umount(struct super_block *sb)
{
+
+ if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK))
+ return;
+
/*
* SB_ACTIVE should be reliably cleared before checking
* isw_nr_in_flight, see generic_shutdown_super().
@@ -1381,12 +1386,13 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
static void inode_sync_complete(struct inode *inode)
{
+ assert_spin_locked(&inode->i_lock);
+
inode->i_state &= ~I_SYNC;
	/* If inode is clean and unused, put it into LRU now... */
inode_add_lru(inode);
- /* Waiters must see I_SYNC cleared before being woken up */
- smp_mb();
- wake_up_bit(&inode->i_state, __I_SYNC);
+ /* Called with inode->i_lock which ensures memory ordering. */
+ inode_wake_up_bit(inode, __I_SYNC);
}
static bool inode_dirtied_after(struct inode *inode, unsigned long t)
@@ -1505,30 +1511,27 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
* Wait for writeback on an inode to complete. Called with i_lock held.
* Caller must make sure inode cannot go away when we drop i_lock.
*/
-static void __inode_wait_for_writeback(struct inode *inode)
- __releases(inode->i_lock)
- __acquires(inode->i_lock)
+void inode_wait_for_writeback(struct inode *inode)
{
- DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
- wait_queue_head_t *wqh;
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
+
+ assert_spin_locked(&inode->i_lock);
- wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
- while (inode->i_state & I_SYNC) {
+ if (!(inode->i_state & I_SYNC))
+ return;
+
+ wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
+ for (;;) {
+ prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+ /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
+ if (!(inode->i_state & I_SYNC))
+ break;
spin_unlock(&inode->i_lock);
- __wait_on_bit(wqh, &wq, bit_wait,
- TASK_UNINTERRUPTIBLE);
+ schedule();
spin_lock(&inode->i_lock);
}
-}
-
-/*
- * Wait for writeback on an inode to complete. Caller must have inode pinned.
- */
-void inode_wait_for_writeback(struct inode *inode)
-{
- spin_lock(&inode->i_lock);
- __inode_wait_for_writeback(inode);
- spin_unlock(&inode->i_lock);
+ finish_wait(wq_head, &wqe.wq_entry);
}
/*
@@ -1539,16 +1542,20 @@ void inode_wait_for_writeback(struct inode *inode)
static void inode_sleep_on_writeback(struct inode *inode)
__releases(inode->i_lock)
{
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
- int sleep;
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
+ bool sleep;
+
+ assert_spin_locked(&inode->i_lock);
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- sleep = inode->i_state & I_SYNC;
+ wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
+ prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+ /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
+ sleep = !!(inode->i_state & I_SYNC);
spin_unlock(&inode->i_lock);
if (sleep)
schedule();
- finish_wait(wqh, &wait);
+ finish_wait(wq_head, &wqe.wq_entry);
}
/*
@@ -1752,7 +1759,7 @@ static int writeback_single_inode(struct inode *inode,
*/
if (wbc->sync_mode != WB_SYNC_ALL)
goto out;
- __inode_wait_for_writeback(inode);
+ inode_wait_for_writeback(inode);
}
WARN_ON(inode->i_state & I_SYNC);
/*
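
The reworked I_SYNC wait above keeps a simple shape: the bit is set and cleared only under inode->i_lock, so the waiter just re-checks the bit after each wakeup and needs no extra barriers. A minimal userspace analogue of that shape, with a pthread mutex and condition variable standing in for i_lock and the wait-bit queue (all names hypothetical, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  state_cond = PTHREAD_COND_INITIALIZER;
static unsigned int state = 0x1;		/* bit 0 plays the role of I_SYNC */
#define STATE_SYNC 0x1u

static void wait_for_sync_clear(void)
{
	pthread_mutex_lock(&state_lock);
	while (state & STATE_SYNC)		/* re-check after every wakeup */
		pthread_cond_wait(&state_cond, &state_lock);
	pthread_mutex_unlock(&state_lock);
}

static void *clear_sync(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&state_lock);
	state &= ~STATE_SYNC;			/* clear under the same lock ... */
	pthread_cond_broadcast(&state_cond);	/* ... then wake any waiter */
	pthread_mutex_unlock(&state_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, clear_sync, NULL);
	wait_for_sync_clear();
	pthread_join(t, NULL);
	puts("sync bit cleared");
	return 0;
}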
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9eb191b5c4de..f0c9cd1a0b39 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -31,6 +31,8 @@ MODULE_ALIAS("devname:fuse");
static struct kmem_cache *fuse_req_cachep;
+static void end_requests(struct list_head *head);
+
static struct fuse_dev *fuse_get_dev(struct file *file)
{
/*
@@ -773,7 +775,6 @@ static int fuse_check_folio(struct folio *folio)
(folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
- 1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
1 << PG_workingset |
@@ -818,9 +819,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
newfolio = page_folio(buf->page);
- if (!folio_test_uptodate(newfolio))
- folio_mark_uptodate(newfolio);
-
+ folio_clear_uptodate(newfolio);
folio_clear_mappedtodisk(newfolio);
if (fuse_check_folio(newfolio) != 0)
@@ -1618,9 +1617,11 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
this_num = min_t(unsigned, num, PAGE_SIZE - offset);
err = fuse_copy_page(cs, &page, offset, this_num, 0);
- if (!err && offset == 0 &&
- (this_num == PAGE_SIZE || file_size == end))
+ if (!PageUptodate(page) && !err && offset == 0 &&
+ (this_num == PAGE_SIZE || file_size == end)) {
+ zero_user_segment(page, this_num, PAGE_SIZE);
SetPageUptodate(page);
+ }
unlock_page(page);
put_page(page);
@@ -1820,6 +1821,13 @@ static void fuse_resend(struct fuse_conn *fc)
}
spin_lock(&fiq->lock);
+ if (!fiq->connected) {
+ spin_unlock(&fiq->lock);
+ list_for_each_entry(req, &to_queue, list)
+ clear_bit(FR_PENDING, &req->flags);
+ end_requests(&to_queue);
+ return;
+ }
/* iq and pq requests are both oldest to newest */
list_splice(&to_queue, &fiq->pending);
fiq->ops->wake_pending_and_unlock(fiq);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2b0d4781f394..8e96df9fd76c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -670,7 +670,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
err = get_create_ext(&args, dir, entry, mode);
if (err)
- goto out_put_forget_req;
+ goto out_free_ff;
err = fuse_simple_request(fm, &args);
free_ext_value(&args);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f39456c65ed7..ba6df52a823e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1832,10 +1832,16 @@ __acquires(fi->lock)
fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
- /* After fuse_writepage_finish() aux request list is private */
+ /* After rb_erase() aux request list is private */
for (aux = wpa->next; aux; aux = next) {
+ struct backing_dev_info *bdi = inode_to_bdi(aux->inode);
+
next = aux->next;
aux->next = NULL;
+
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+ dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP);
+ wb_writeout_inc(&bdi->wb);
fuse_writepage_free(aux);
}
@@ -2387,76 +2393,77 @@ out:
* but how to implement it without killing performance needs more thinking.
*/
static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
- struct page *page;
+ struct folio *folio;
loff_t fsize;
int err = -ENOMEM;
WARN_ON(!fc->writeback_cache);
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
goto error;
- fuse_wait_on_page_writeback(mapping->host, page->index);
+ fuse_wait_on_page_writeback(mapping->host, folio->index);
- if (PageUptodate(page) || len == PAGE_SIZE)
+ if (folio_test_uptodate(folio) || len >= folio_size(folio))
goto success;
/*
- * Check if the start this page comes after the end of file, in which
- * case the readpage can be optimized away.
+ * Check if the start of this folio comes after the end of file,
+ * in which case the readpage can be optimized away.
*/
fsize = i_size_read(mapping->host);
- if (fsize <= (pos & PAGE_MASK)) {
- size_t off = pos & ~PAGE_MASK;
+ if (fsize <= folio_pos(folio)) {
+ size_t off = offset_in_folio(folio, pos);
if (off)
- zero_user_segment(page, 0, off);
+ folio_zero_segment(folio, 0, off);
goto success;
}
- err = fuse_do_readpage(file, page);
+ err = fuse_do_readpage(file, &folio->page);
if (err)
goto cleanup;
success:
- *pagep = page;
+ *foliop = folio;
return 0;
cleanup:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
error:
return err;
}
static int fuse_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
/* Haven't copied anything? Skip zeroing, size extending, dirtying. */
if (!copied)
goto unlock;
pos += copied;
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
/* Zero any unwritten bytes at the end of the page */
size_t endoff = pos & ~PAGE_MASK;
if (endoff)
- zero_user_segment(page, endoff, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segment(folio, endoff, PAGE_SIZE);
+ folio_mark_uptodate(folio);
}
if (pos > inode->i_size)
i_size_write(inode, pos);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d8ab4e93916f..bebd89002328 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1332,11 +1332,16 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
* on a stacked fs (e.g. overlayfs) themselves and with
* max_stack_depth == 1, FUSE fs can be stacked as the
* underlying fs of a stacked fs (e.g. overlayfs).
+ *
+ * Also don't allow the combination of FUSE_PASSTHROUGH
+ * and FUSE_WRITEBACK_CACHE; the current design doesn't
+ * handle them together.
*/
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) &&
(flags & FUSE_PASSTHROUGH) &&
arg->max_stack_depth > 0 &&
- arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) {
+ arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH &&
+ !(flags & FUSE_WRITEBACK_CACHE)) {
fc->passthrough = 1;
fc->max_stack_depth = arg->max_stack_depth;
fm->sb->s_stack_depth = arg->max_stack_depth;
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 5b423fdbb13f..9f568d345c51 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -81,7 +81,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
}
ret = fuse_simple_request(fm, &args);
if (!ret && !size)
- ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX);
+ ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX);
if (ret == -ENOSYS) {
fm->fc->no_getxattr = 1;
ret = -EOPNOTSUPP;
@@ -143,7 +143,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
}
ret = fuse_simple_request(fm, &args);
if (!ret && !size)
- ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX);
+ ret = min_t(size_t, outarg.size, XATTR_LIST_MAX);
if (ret > 0 && size)
ret = fuse_verify_xattr_list(list, ret);
if (ret == -ENOSYS) {
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 6d1878b99b30..4a0ce131e233 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -487,15 +487,15 @@ void hfs_file_truncate(struct inode *inode)
if (inode->i_size > HFS_I(inode)->phys_size) {
struct address_space *mapping = inode->i_mapping;
void *fsdata = NULL;
- struct page *page;
+ struct folio *folio;
/* XXX: Can use generic_cont_expand? */
size = inode->i_size - 1;
- res = hfs_write_begin(NULL, mapping, size + 1, 0, &page,
+ res = hfs_write_begin(NULL, mapping, size + 1, 0, &folio,
&fsdata);
if (!res) {
res = generic_write_end(NULL, mapping, size + 1, 0, 0,
- page, fsdata);
+ folio, fsdata);
}
if (res)
inode->i_size = HFS_I(inode)->phys_size;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index b5a6ad5df357..a0c7cb0f79fc 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -202,7 +202,7 @@ extern const struct address_space_operations hfs_aops;
extern const struct address_space_operations hfs_btree_aops;
int hfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata);
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
extern int hfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 744e10b46904..a81ce7a740b9 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -45,12 +45,11 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
}
int hfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
- *pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
hfs_get_block,
&HFS_I(mapping->host)->phys_size);
if (unlikely(ret))
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 9c51867dddc5..a6d61685ae79 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -554,16 +554,16 @@ void hfsplus_file_truncate(struct inode *inode)
if (inode->i_size > hip->phys_size) {
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
loff_t size = inode->i_size;
res = hfsplus_write_begin(NULL, mapping, size, 0,
- &page, &fsdata);
+ &folio, &fsdata);
if (res)
return;
res = generic_write_end(NULL, mapping, size, 0, 0,
- page, fsdata);
+ folio, fsdata);
if (res < 0)
return;
mark_inode_dirty(inode);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 9e78f181c24f..59ce81dca73f 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -472,7 +472,7 @@ extern const struct address_space_operations hfsplus_btree_aops;
extern const struct dentry_operations hfsplus_dentry_operations;
int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata);
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode);
void hfsplus_delete_inode(struct inode *inode);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 3d326926c195..f331e9574217 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -39,12 +39,11 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
}
int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
- *pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
hfsplus_get_block,
&HFSPLUS_I(mapping->host)->phys_size);
if (unlikely(ret))
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 22df574ca99e..6d1cf2436ead 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -465,31 +465,32 @@ static int hostfs_read_folio(struct file *file, struct folio *folio)
static int hostfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
pgoff_t index = pos >> PAGE_SHIFT;
- *pagep = grab_cache_page_write_begin(mapping, index);
- if (!*pagep)
+ *foliop = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (!*foliop)
return -ENOMEM;
return 0;
}
static int hostfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
void *buffer;
- unsigned from = pos & (PAGE_SIZE - 1);
+ size_t from = offset_in_folio(folio, pos);
int err;
- buffer = kmap_local_page(page);
- err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
+ buffer = kmap_local_folio(folio, from);
+ err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied);
kunmap_local(buffer);
- if (!PageUptodate(page) && err == PAGE_SIZE)
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio) && err == folio_size(folio))
+ folio_mark_uptodate(folio);
/*
* If err > 0, write_file has added err to pos, so we are comparing
@@ -497,8 +498,8 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
*/
if (err > 0 && (pos > inode->i_size))
inode->i_size = pos;
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return err;
}
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 1bb8d97cd9ae..449a3fc1b8d9 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -190,12 +190,11 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
static int hpfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- *pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
+ ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -206,11 +205,11 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
static int hpfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *pagep, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int err;
- err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+ err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
if (err < len)
hpfs_write_failed(mapping, pos + len);
if (!(err < 0)) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9f6cff356796..5cf327337e22 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -388,14 +388,14 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
static int hugetlbfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
return -EINVAL;
}
static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
BUG();
return -EINVAL;
diff --git a/fs/inode.c b/fs/inode.c
index 86670941884b..af78f515403f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -472,6 +472,17 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
inode->i_state |= I_REFERENCED;
}
+struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+ struct inode *inode, u32 bit)
+{
+ void *bit_address;
+
+ bit_address = inode_state_wait_address(inode, bit);
+ init_wait_var_entry(wqe, bit_address, 0);
+ return __var_waitqueue(bit_address);
+}
+EXPORT_SYMBOL(inode_bit_waitqueue);
+
/*
* Add inode to LRU if needed (inode is unused and clean).
*
@@ -488,6 +499,49 @@ static void inode_lru_list_del(struct inode *inode)
this_cpu_dec(nr_unused);
}
+static void inode_pin_lru_isolating(struct inode *inode)
+{
+ lockdep_assert_held(&inode->i_lock);
+ WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
+ inode->i_state |= I_LRU_ISOLATING;
+}
+
+static void inode_unpin_lru_isolating(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
+ inode->i_state &= ~I_LRU_ISOLATING;
+ /* Called with inode->i_lock which ensures memory ordering. */
+ inode_wake_up_bit(inode, __I_LRU_ISOLATING);
+ spin_unlock(&inode->i_lock);
+}
+
+static void inode_wait_for_lru_isolating(struct inode *inode)
+{
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
+
+ lockdep_assert_held(&inode->i_lock);
+ if (!(inode->i_state & I_LRU_ISOLATING))
+ return;
+
+ wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
+ for (;;) {
+ prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+ /*
+ * Checking I_LRU_ISOLATING with inode->i_lock guarantees
+ * memory ordering.
+ */
+ if (!(inode->i_state & I_LRU_ISOLATING))
+ break;
+ spin_unlock(&inode->i_lock);
+ schedule();
+ spin_lock(&inode->i_lock);
+ }
+ finish_wait(wq_head, &wqe.wq_entry);
+ WARN_ON(inode->i_state & I_LRU_ISOLATING);
+}
+
/**
* inode_sb_list_add - add inode to the superblock list of inodes
* @inode: inode to add
@@ -562,6 +616,7 @@ void dump_mapping(const struct address_space *mapping)
struct hlist_node *dentry_first;
struct dentry *dentry_ptr;
struct dentry dentry;
+ char fname[64] = {};
unsigned long ino;
/*
@@ -598,11 +653,14 @@ void dump_mapping(const struct address_space *mapping)
return;
}
+ if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0)
+ strscpy(fname, "<invalid>");
/*
- * if dentry is corrupted, the %pd handler may still crash,
- * but it's unlikely that we reach here with a corrupt mapping
+ * Even if strncpy_from_kernel_nofault() succeeded,
+ * the fname could be unreliable
*/
- pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry);
+ pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n",
+ a_ops, ino, fname);
}
void clear_inode(struct inode *inode)
@@ -657,6 +715,9 @@ static void evict(struct inode *inode)
inode_sb_list_del(inode);
+ spin_lock(&inode->i_lock);
+ inode_wait_for_lru_isolating(inode);
+
/*
* Wait for flusher thread to be done with the inode so that filesystem
* does not start destroying it while writeback is still running. Since
@@ -664,6 +725,7 @@ static void evict(struct inode *inode)
* the inode. We just have to wait for running writeback to finish.
*/
inode_wait_for_writeback(inode);
+ spin_unlock(&inode->i_lock);
if (op->evict_inode) {
op->evict_inode(inode);
@@ -687,7 +749,13 @@ static void evict(struct inode *inode)
* used as an indicator whether blocking on it is safe.
*/
spin_lock(&inode->i_lock);
- wake_up_bit(&inode->i_state, __I_NEW);
+ /*
+ * Pairs with the barrier in prepare_to_wait_event() to make sure
+ * ___wait_var_event() either sees the bit cleared or
+ * waitqueue_active() check in wake_up_var() sees the waiter.
+ */
+ smp_mb();
+ inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
spin_unlock(&inode->i_lock);
@@ -735,6 +803,10 @@ again:
continue;
spin_lock(&inode->i_lock);
+ if (atomic_read(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
continue;
@@ -855,7 +927,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* be under pressure before the cache inside the highmem zone.
*/
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
- __iget(inode);
+ inode_pin_lru_isolating(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
if (remove_inode_buffers(inode)) {
@@ -867,7 +939,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
__count_vm_events(PGINODESTEAL, reap);
mm_account_reclaimed_pages(reap);
}
- iput(inode);
+ inode_unpin_lru_isolating(inode);
spin_lock(lru_lock);
return LRU_RETRY;
}
@@ -1095,8 +1167,13 @@ void unlock_new_inode(struct inode *inode)
spin_lock(&inode->i_lock);
WARN_ON(!(inode->i_state & I_NEW));
inode->i_state &= ~I_NEW & ~I_CREATING;
+ /*
+ * Pairs with the barrier in prepare_to_wait_event() to make sure
+ * ___wait_var_event() either sees the bit cleared or
+ * waitqueue_active() check in wake_up_var() sees the waiter.
+ */
smp_mb();
- wake_up_bit(&inode->i_state, __I_NEW);
+ inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);
@@ -1107,8 +1184,13 @@ void discard_new_inode(struct inode *inode)
spin_lock(&inode->i_lock);
WARN_ON(!(inode->i_state & I_NEW));
inode->i_state &= ~I_NEW;
+ /*
+ * Pairs with the barrier in prepare_to_wait_event() to make sure
+ * ___wait_var_event() either sees the bit cleared or
+ * waitqueue_active() check in wake_up_var() sees the waiter.
+ */
smp_mb();
- wake_up_bit(&inode->i_state, __I_NEW);
+ inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
iput(inode);
}
@@ -1535,9 +1617,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
again:
- spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino, true);
- spin_unlock(&inode_hash_lock);
+ inode = find_inode_fast(sb, head, ino, false);
if (inode) {
if (IS_ERR(inode))
@@ -2299,8 +2379,8 @@ EXPORT_SYMBOL(inode_needs_sync);
*/
static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
{
- wait_queue_head_t *wq;
- DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
/*
* Handle racing against evict(), see that routine for more details.
@@ -2311,14 +2391,14 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
return;
}
- wq = bit_waitqueue(&inode->i_state, __I_NEW);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+ prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
if (is_inode_hash_locked)
spin_unlock(&inode_hash_lock);
schedule();
- finish_wait(wq, &wait.wq_entry);
+ finish_wait(wq_head, &wqe.wq_entry);
if (is_inode_hash_locked)
spin_lock(&inode_hash_lock);
rcu_read_lock();
@@ -2467,18 +2547,11 @@ EXPORT_SYMBOL(inode_owner_or_capable);
/*
* Direct i/o helper functions
*/
-static void __inode_dio_wait(struct inode *inode)
+bool inode_dio_finished(const struct inode *inode)
{
- wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
- DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
-
- do {
- prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
- if (atomic_read(&inode->i_dio_count))
- schedule();
- } while (atomic_read(&inode->i_dio_count));
- finish_wait(wq, &q.wq_entry);
+ return atomic_read(&inode->i_dio_count) == 0;
}
+EXPORT_SYMBOL(inode_dio_finished);
/**
* inode_dio_wait - wait for outstanding DIO requests to finish
@@ -2492,11 +2565,17 @@ static void __inode_dio_wait(struct inode *inode)
*/
void inode_dio_wait(struct inode *inode)
{
- if (atomic_read(&inode->i_dio_count))
- __inode_dio_wait(inode);
+ wait_var_event(&inode->i_dio_count, inode_dio_finished(inode));
}
EXPORT_SYMBOL(inode_dio_wait);
+void inode_dio_wait_interruptible(struct inode *inode)
+{
+ wait_var_event_interruptible(&inode->i_dio_count,
+ inode_dio_finished(inode));
+}
+EXPORT_SYMBOL(inode_dio_wait_interruptible);
+
/*
* inode_set_flags - atomically set some inode flags
*
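
The fs/inode.c hunks above reduce inode_dio_wait() to "sleep until i_dio_count reaches zero", expressed through wait_var_event(). A hedged userspace sketch of the same shape, with the final decrement doing the wakeup (hypothetical names, ordinary pthread primitives rather than the kernel's wait-var machinery):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dio_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  dio_cond = PTHREAD_COND_INITIALIZER;
static int dio_count = 3;		/* plays the role of i_dio_count */

static void dio_end(void)		/* analogue of dropping one DIO ref */
{
	pthread_mutex_lock(&dio_lock);
	if (--dio_count == 0)
		pthread_cond_broadcast(&dio_cond);
	pthread_mutex_unlock(&dio_lock);
}

static void dio_wait(void)		/* analogue of inode_dio_wait() */
{
	pthread_mutex_lock(&dio_lock);
	while (dio_count != 0)
		pthread_cond_wait(&dio_cond, &dio_lock);
	pthread_mutex_unlock(&dio_lock);
}

int main(void)
{
	dio_end(); dio_end(); dio_end();	/* in-flight DIOs complete */
	dio_wait();				/* returns once the count is zero */
	puts("all DIO finished");
	return 0;
}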
diff --git a/fs/internal.h b/fs/internal.h
index cdd73209eecb..8c1b7acbbe8f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -337,3 +337,4 @@ static inline bool path_mounted(const struct path *path)
{
return path->mnt->mnt_root == path->dentry;
}
+void file_f_owner_release(struct file *file);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f420c53d86ac..9b4ca3811a24 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -900,7 +900,7 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t bh_written;
bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
- len, copied, &folio->page, NULL);
+ len, copied, folio, NULL);
WARN_ON_ONCE(bh_written != copied && bh_written != 0);
return bh_written == copied;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e12cb145147e..13c18ccc13b0 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -23,10 +23,10 @@
static int jffs2_write_end(struct file *filp, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *pg, void *fsdata);
+ struct folio *folio, void *fsdata);
static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata);
+ struct folio **foliop, void **fsdata);
static int jffs2_read_folio(struct file *filp, struct folio *folio);
int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
@@ -77,29 +77,27 @@ const struct address_space_operations jffs2_file_address_operations =
.write_end = jffs2_write_end,
};
-static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
+static int jffs2_do_readpage_nolock(struct inode *inode, struct folio *folio)
{
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
- unsigned char *pg_buf;
+ unsigned char *kaddr;
int ret;
jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n",
- __func__, inode->i_ino, pg->index << PAGE_SHIFT);
+ __func__, inode->i_ino, folio->index << PAGE_SHIFT);
- BUG_ON(!PageLocked(pg));
+ BUG_ON(!folio_test_locked(folio));
- pg_buf = kmap(pg);
- /* FIXME: Can kmap fail? */
-
- ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT,
+ kaddr = kmap_local_folio(folio, 0);
+ ret = jffs2_read_inode_range(c, f, kaddr, folio->index << PAGE_SHIFT,
PAGE_SIZE);
+ kunmap_local(kaddr);
if (!ret)
- SetPageUptodate(pg);
+ folio_mark_uptodate(folio);
- flush_dcache_page(pg);
- kunmap(pg);
+ flush_dcache_folio(folio);
jffs2_dbg(2, "readpage finished\n");
return ret;
@@ -107,7 +105,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
int __jffs2_read_folio(struct file *file, struct folio *folio)
{
- int ret = jffs2_do_readpage_nolock(folio->mapping->host, &folio->page);
+ int ret = jffs2_do_readpage_nolock(folio->mapping->host, folio);
folio_unlock(folio);
return ret;
}
@@ -125,9 +123,9 @@ static int jffs2_read_folio(struct file *file, struct folio *folio)
static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
- struct page *pg;
+ struct folio *folio;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
@@ -206,29 +204,30 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* page in read_cache_page(), which causes a deadlock.
*/
mutex_lock(&c->alloc_sem);
- pg = grab_cache_page_write_begin(mapping, index);
- if (!pg) {
- ret = -ENOMEM;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto release_sem;
}
- *pagep = pg;
+ *foliop = folio;
/*
- * Read in the page if it wasn't already present. Cannot optimize away
- * the whole page write case until jffs2_write_end can handle the
+ * Read in the folio if it wasn't already present. Cannot optimize away
+ * the whole folio write case until jffs2_write_end can handle the
* case of a short-copy.
*/
- if (!PageUptodate(pg)) {
+ if (!folio_test_uptodate(folio)) {
mutex_lock(&f->sem);
- ret = jffs2_do_readpage_nolock(inode, pg);
+ ret = jffs2_do_readpage_nolock(inode, folio);
mutex_unlock(&f->sem);
if (ret) {
- unlock_page(pg);
- put_page(pg);
+ folio_unlock(folio);
+ folio_put(folio);
goto release_sem;
}
}
- jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
+ jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags);
release_sem:
mutex_unlock(&c->alloc_sem);
@@ -238,7 +237,7 @@ out_err:
static int jffs2_write_end(struct file *filp, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *pg, void *fsdata)
+ struct folio *folio, void *fsdata)
{
/* Actually commit the write from the page cache page we're looking at.
* For now, we write the full page out each time. It sucks, but it's simple
@@ -252,16 +251,17 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
unsigned aligned_start = start & ~3;
int ret = 0;
uint32_t writtenlen = 0;
+ void *buf;
- jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
- __func__, inode->i_ino, pg->index << PAGE_SHIFT,
- start, end, pg->flags);
+ jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n",
+ __func__, inode->i_ino, folio_pos(folio),
+ start, end, folio->flags);
/* We need to avoid deadlock with page_cache_read() in
- jffs2_garbage_collect_pass(). So the page must be
+ jffs2_garbage_collect_pass(). So the folio must be
up to date to prevent page_cache_read() from trying
to re-lock it. */
- BUG_ON(!PageUptodate(pg));
+ BUG_ON(!folio_test_uptodate(folio));
if (end == PAGE_SIZE) {
/* When writing out the end of a page, write out the
@@ -276,8 +276,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
if (!ri) {
jffs2_dbg(1, "%s(): Allocation of raw inode failed\n",
__func__);
- unlock_page(pg);
- put_page(pg);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
@@ -289,15 +289,11 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
ri->isize = cpu_to_je32((uint32_t)inode->i_size);
ri->atime = ri->ctime = ri->mtime = cpu_to_je32(JFFS2_NOW());
- /* In 2.4, it was already kmapped by generic_file_write(). Doesn't
- hurt to do it again. The alternative is ifdefs, which are ugly. */
- kmap(pg);
-
- ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start,
- (pg->index << PAGE_SHIFT) + aligned_start,
+ buf = kmap_local_folio(folio, aligned_start);
+ ret = jffs2_write_inode_range(c, f, ri, buf,
+ folio_pos(folio) + aligned_start,
end - aligned_start, &writtenlen);
-
- kunmap(pg);
+ kunmap_local(buf);
if (ret)
mapping_set_error(mapping, ret);
@@ -323,12 +319,12 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
it gets reread */
jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n",
__func__);
- ClearPageUptodate(pg);
+ folio_clear_uptodate(folio);
}
jffs2_dbg(1, "%s() returning %d\n",
__func__, writtenlen > 0 ? writtenlen : ret);
- unlock_page(pg);
- put_page(pg);
+ folio_unlock(folio);
+ folio_put(folio);
return writtenlen > 0 ? writtenlen : ret;
}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 5c6602f3c189..822949d0eb00 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1171,7 +1171,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
uint32_t alloclen, offset, orig_end, orig_start;
int ret = 0;
unsigned char *comprbuf = NULL, *writebuf;
- struct page *page;
+ struct folio *folio;
unsigned char *pg_ptr;
memset(&ri, 0, sizeof(ri));
@@ -1317,25 +1317,25 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
BUG_ON(start > orig_start);
}
- /* The rules state that we must obtain the page lock *before* f->sem, so
+ /* The rules state that we must obtain the folio lock *before* f->sem, so
* drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's
* actually going to *change* so we're safe; we only allow reading.
*
* It is important to note that jffs2_write_begin() will ensure that its
- * page is marked Uptodate before allocating space. That means that if we
- * end up here trying to GC the *same* page that jffs2_write_begin() is
- * trying to write out, read_cache_page() will not deadlock. */
+ * folio is marked uptodate before allocating space. That means that if we
+ * end up here trying to GC the *same* folio that jffs2_write_begin() is
+ * trying to write out, read_cache_folio() will not deadlock. */
mutex_unlock(&f->sem);
- page = read_cache_page(inode->i_mapping, start >> PAGE_SHIFT,
+ folio = read_cache_folio(inode->i_mapping, start >> PAGE_SHIFT,
__jffs2_read_folio, NULL);
- if (IS_ERR(page)) {
- pr_warn("read_cache_page() returned error: %ld\n",
- PTR_ERR(page));
+ if (IS_ERR(folio)) {
+ pr_warn("read_cache_folio() returned error: %ld\n",
+ PTR_ERR(folio));
mutex_lock(&f->sem);
- return PTR_ERR(page);
+ return PTR_ERR(folio);
}
- pg_ptr = kmap(page);
+ pg_ptr = kmap_local_folio(folio, 0);
mutex_lock(&f->sem);
offset = start;
@@ -1400,7 +1400,6 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
}
}
- kunmap(page);
- put_page(page);
+ folio_release_kmap(folio, pg_ptr);
return ret;
}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 1a6b5921d17a..07cfdc440596 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -292,11 +292,11 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
static int jfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block);
+ ret = block_write_begin(mapping, pos, len, foliop, jfs_get_block);
if (unlikely(ret))
jfs_write_failed(mapping, pos + len);
@@ -304,12 +304,12 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
}
static int jfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct page *page,
+ loff_t pos, unsigned len, unsigned copied, struct folio *folio,
void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
jfs_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c
index 575cb2ba74fc..5f4b305030ad 100644
--- a/fs/jfs/jfs_discard.c
+++ b/fs/jfs/jfs_discard.c
@@ -65,7 +65,7 @@ void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks)
int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
{
struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
- struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+ struct bmap *bmp;
struct super_block *sb = ipbmap->i_sb;
int agno, agno_end;
u64 start, end, minlen;
@@ -83,10 +83,15 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
if (minlen == 0)
minlen = 1;
+ down_read(&sb->s_umount);
+ bmp = JFS_SBI(ip->i_sb)->bmap;
+
if (minlen > bmp->db_agsize ||
start >= bmp->db_mapsize ||
- range->len < sb->s_blocksize)
+ range->len < sb->s_blocksize) {
+ up_read(&sb->s_umount);
return -EINVAL;
+ }
if (end >= bmp->db_mapsize)
end = bmp->db_mapsize - 1;
@@ -100,6 +105,8 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
trimmed += dbDiscardAG(ip, agno, minlen);
agno++;
}
+
+ up_read(&sb->s_umount);
range->len = trimmed << sb->s_blocksize_bits;
return 0;
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 5713994328cb..974ecf5e0d95 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -187,7 +187,7 @@ int dbMount(struct inode *ipbmap)
}
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
- if (!bmp->db_numag) {
+ if (!bmp->db_numag || bmp->db_numag >= MAXAG) {
err = -EINVAL;
goto err_release_metapage;
}
@@ -652,7 +652,7 @@ int dbNextAG(struct inode *ipbmap)
* average free space.
*/
for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
- if (agpref == bmp->db_numag)
+ if (agpref >= bmp->db_numag)
agpref = 0;
if (atomic_read(&bmp->db_active[agpref]))
@@ -2944,9 +2944,10 @@ static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl)
static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl)
{
int ti, n = 0, k, x = 0;
- int max_size;
+ int max_size, max_idx;
max_size = is_ctl ? CTLTREESIZE : TREESIZE;
+ max_idx = is_ctl ? LPERCTL : LPERDMAP;
/* first check the root of the tree to see if there is
* sufficient free space.
@@ -2978,6 +2979,8 @@ static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl)
*/
assert(n < 4);
}
+ if (le32_to_cpu(tp->dmt_leafidx) >= max_idx)
+ return -ENOSPC;
/* set the return to the leftmost leaf describing sufficient
* free space.
@@ -3022,7 +3025,7 @@ static int dbFindBits(u32 word, int l2nb)
/* scan the word for nb free bits at nb alignments.
*/
- for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
+ for (bitno = 0; mask != 0; bitno += nb, mask = (mask >> nb)) {
if ((mask & word) == mask)
break;
}
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 1407feccbc2d..a360b24ed320 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1360,7 +1360,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
/* get the ag number of this iag */
agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb));
dn_numag = JFS_SBI(pip->i_sb)->bmap->db_numag;
- if (agno < 0 || agno > dn_numag)
+ if (agno < 0 || agno > dn_numag || agno >= MAXAG)
return -EIO;
if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2999ed5d83f5..0fb05e314edf 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -434,6 +434,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
int rc;
int quota_allocation = 0;
+ memset(&ea_buf->new_ea, 0, sizeof(ea_buf->new_ea));
+
/* When fsck.jfs clears a bad ea, it doesn't clear the size */
if (ji->ea.flag == 0)
ea_size = 0;
diff --git a/fs/libfs.c b/fs/libfs.c
index 8aa34870449f..46966fd8bcf9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -450,6 +450,14 @@ void simple_offset_destroy(struct offset_ctx *octx)
mtree_destroy(&octx->mt);
}
+static int offset_dir_open(struct inode *inode, struct file *file)
+{
+ struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
+
+ file->private_data = (void *)ctx->next_offset;
+ return 0;
+}
+
/**
* offset_dir_llseek - Advance the read position of a directory descriptor
* @file: an open directory whose position is to be updated
@@ -463,6 +471,9 @@ void simple_offset_destroy(struct offset_ctx *octx)
*/
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
+ struct inode *inode = file->f_inode;
+ struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
+
switch (whence) {
case SEEK_CUR:
offset += file->f_pos;
@@ -476,7 +487,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
}
/* In this case, ->private_data is protected by f_pos_lock */
- file->private_data = NULL;
+ if (!offset)
+ file->private_data = (void *)ctx->next_offset;
return vfs_setpos(file, offset, LONG_MAX);
}
@@ -507,7 +519,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
inode->i_ino, fs_umode_to_dtype(inode->i_mode));
}
-static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
+static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
{
struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
struct dentry *dentry;
@@ -515,17 +527,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
while (true) {
dentry = offset_find_next(octx, ctx->pos);
if (!dentry)
- return ERR_PTR(-ENOENT);
+ return;
+
+ if (dentry2offset(dentry) >= last_index) {
+ dput(dentry);
+ return;
+ }
if (!offset_dir_emit(ctx, dentry)) {
dput(dentry);
- break;
+ return;
}
ctx->pos = dentry2offset(dentry) + 1;
dput(dentry);
}
- return NULL;
}
/**
@@ -552,22 +568,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
static int offset_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dir = file->f_path.dentry;
+ long last_index = (long)file->private_data;
lockdep_assert_held(&d_inode(dir)->i_rwsem);
if (!dir_emit_dots(file, ctx))
return 0;
- /* In this case, ->private_data is protected by f_pos_lock */
- if (ctx->pos == DIR_OFFSET_MIN)
- file->private_data = NULL;
- else if (file->private_data == ERR_PTR(-ENOENT))
- return 0;
- file->private_data = offset_iterate_dir(d_inode(dir), ctx);
+ offset_iterate_dir(d_inode(dir), ctx, last_index);
return 0;
}
const struct file_operations simple_offset_dir_operations = {
+ .open = offset_dir_open,
.llseek = offset_dir_llseek,
.iterate_shared = offset_readdir,
.read = generic_read_dir,
@@ -901,7 +914,7 @@ static int simple_read_folio(struct file *file, struct folio *folio)
int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct folio *folio;
@@ -910,7 +923,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
if (IS_ERR(folio))
return PTR_ERR(folio);
- *pagep = &folio->page;
+ *foliop = folio;
if (!folio_test_uptodate(folio) && (len != folio_size(folio))) {
size_t from = offset_in_folio(folio, pos);
@@ -929,11 +942,11 @@ EXPORT_SYMBOL(simple_write_begin);
* @pos: "
* @len: "
* @copied: "
- * @page: "
+ * @folio: "
* @fsdata: "
*
- * simple_write_end does the minimum needed for updating a page after writing is
- * done. It has the same API signature as the .write_end of
+ * simple_write_end does the minimum needed for updating a folio after
+ * writing is done. It has the same API signature as the .write_end of
* address_space_operations vector. So it can just be set onto .write_end for
* FSes that don't need any other processing. i_mutex is assumed to be held.
* Block based filesystems should use generic_write_end().
@@ -946,9 +959,8 @@ EXPORT_SYMBOL(simple_write_begin);
*/
static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
@@ -1990,13 +2002,19 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force)
* information, but the legacy inode_inc_iversion code used a spinlock
* to serialize increments.
*
- * Here, we add full memory barriers to ensure that any de-facto
- * ordering with other info is preserved.
+ * We add a full memory barrier to ensure that any de facto ordering
+ * with other state is preserved (either implicitly coming from cmpxchg
+ * or explicitly from smp_mb if we don't know upfront whether we will execute
+ * the former).
*
- * This barrier pairs with the barrier in inode_query_iversion()
+ * These barriers pair with inode_query_iversion().
*/
- smp_mb();
cur = inode_peek_iversion_raw(inode);
+ if (!force && !(cur & I_VERSION_QUERIED)) {
+ smp_mb();
+ cur = inode_peek_iversion_raw(inode);
+ }
+
do {
/* If flag is clear then we needn't do anything */
if (!force && !(cur & I_VERSION_QUERIED))
@@ -2025,20 +2043,22 @@ EXPORT_SYMBOL(inode_maybe_inc_iversion);
u64 inode_query_iversion(struct inode *inode)
{
u64 cur, new;
+ bool fenced = false;
+ /*
+ * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with
+ * inode_maybe_inc_iversion(); see that routine for more details.
+ */
cur = inode_peek_iversion_raw(inode);
do {
/* If flag is already set, then no need to swap */
if (cur & I_VERSION_QUERIED) {
- /*
- * This barrier (and the implicit barrier in the
- * cmpxchg below) pairs with the barrier in
- * inode_maybe_inc_iversion().
- */
- smp_mb();
+ if (!fenced)
+ smp_mb();
break;
}
+ fenced = true;
new = cur | I_VERSION_QUERIED;
} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
return cur >> I_VERSION_QUERIED_SHIFT;
@@ -2104,12 +2124,12 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
}
EXPORT_SYMBOL(simple_inode_init_ts);
-static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
+static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
{
struct dentry *dentry;
guard(rcu)();
- dentry = READ_ONCE(stashed);
+ dentry = rcu_dereference(*stashed);
if (!dentry)
return NULL;
if (!lockref_get_not_dead(&dentry->d_lockref))
@@ -2206,7 +2226,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
/* See if dentry can be reused. */
- path->dentry = get_stashed_dentry(*stashed);
+ path->dentry = get_stashed_dentry(stashed);
if (path->dentry) {
sops->put_data(data);
goto out_path;
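
The i_version hunks above are only about when the pairing barriers are needed; the underlying scheme is unchanged. Stripped of those subtleties, it can be sketched in standalone C11 (hypothetical names; seq_cst atomics stand in for the kernel's finer-grained barrier pairing): the counter lives above bit 0, bit 0 records "queried since the last increment", and writers only pay for an increment when that bit is set.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define QUERIED 1ull

static _Atomic uint64_t iversion;

static bool maybe_inc(bool force)
{
	uint64_t cur = atomic_load(&iversion), new;

	do {
		if (!force && !(cur & QUERIED))
			return false;			/* nobody looked, skip */
		new = (cur & ~QUERIED) + (1ull << 1);	/* bump, clear flag */
	} while (!atomic_compare_exchange_weak(&iversion, &cur, new));
	return true;
}

static uint64_t query(void)
{
	uint64_t cur = atomic_load(&iversion), new;

	do {
		if (cur & QUERIED)
			break;				/* flag already set */
		new = cur | QUERIED;
	} while (!atomic_compare_exchange_weak(&iversion, &cur, new));
	return cur >> 1;
}

int main(void)
{
	printf("inc w/o query: %d\n", maybe_inc(false));	/* 0: skipped */
	printf("version: %llu\n", (unsigned long long)query());
	printf("inc after query: %d\n", maybe_inc(false));	/* 1: bumped */
	return 0;
}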
diff --git a/fs/locks.c b/fs/locks.c
index 9afb16e0683f..b51b1c395ce6 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1451,7 +1451,7 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
struct file *filp = fl->c.flc_file;
f_delown(filp);
- filp->f_owner.signum = 0;
+ file_f_owner(filp)->signum = 0;
fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
if (fl->fl_fasync != NULL) {
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
@@ -1783,6 +1783,10 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr
lease = *flp;
trace_generic_add_lease(inode, lease);
+ error = file_f_owner_allocate(filp);
+ if (error)
+ return error;
+
/* Note that arg is never F_UNLCK here */
ctx = locks_get_lock_context(inode, arg);
if (!ctx)
@@ -2984,7 +2988,7 @@ static int __init filelock_init(void)
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
- filelease_cache = kmem_cache_create("file_lock_cache",
+ filelease_cache = kmem_cache_create("file_lease_cache",
sizeof(struct file_lease), 0, SLAB_PANIC, NULL);
for_each_possible_cpu(i) {
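
The lease path above now calls file_f_owner_allocate() before taking a lease, i.e. the owner state is allocated on demand rather than embedded in every file. Whether or not the kernel helper does exactly this, a common shape for such lazy allocation is a pointer compare-and-swap that discards the loser's allocation; a hedged standalone C11 sketch with made-up names:

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct fown_like { int signum; };

static _Atomic(struct fown_like *) owner;

static int owner_allocate(void)
{
	struct fown_like *new, *expected = NULL;

	if (atomic_load(&owner))
		return 0;			/* already allocated */

	new = calloc(1, sizeof(*new));
	if (!new)
		return -1;

	/* only one caller gets to install its allocation */
	if (!atomic_compare_exchange_strong(&owner, &expected, new))
		free(new);
	return 0;
}

int main(void)
{
	owner_allocate();
	owner_allocate();			/* second call is a no-op */
	printf("signum = %d\n", atomic_load(&owner)->signum);
	free(atomic_load(&owner));
	return 0;
}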
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index a224cf222570..dd2a425b41f0 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -40,18 +40,18 @@ minix_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *dir = mapping->host;
- block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ block_write_end(NULL, mapping, pos, len, len, folio, NULL);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
- unlock_page(page);
+ folio_unlock(folio);
}
static int minix_handle_dirsync(struct inode *dir)
@@ -64,14 +64,15 @@ static int minix_handle_dirsync(struct inode *dir)
return err;
}
-static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p)
+static void *dir_get_folio(struct inode *dir, unsigned long n,
+ struct folio **foliop)
{
- struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
- if (IS_ERR(page))
- return ERR_CAST(page);
- *p = page;
- return kmap_local_page(page);
+ struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL);
+
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
+ *foliop = folio;
+ return kmap_local_folio(folio, 0);
}
static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
@@ -99,9 +100,9 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
char *p, *kaddr, *limit;
- struct page *page;
+ struct folio *folio;
- kaddr = dir_get_page(inode, n, &page);
+ kaddr = dir_get_folio(inode, n, &folio);
if (IS_ERR(kaddr))
continue;
p = kaddr+offset;
@@ -122,13 +123,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
unsigned l = strnlen(name, sbi->s_namelen);
if (!dir_emit(ctx, name, l,
inumber, DT_UNKNOWN)) {
- unmap_and_put_page(page, p);
+ folio_release_kmap(folio, p);
return 0;
}
}
ctx->pos += chunk_size;
}
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
@@ -144,12 +145,13 @@ static inline int namecompare(int len, int maxlen,
/*
* minix_find_entry()
*
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * finds an entry in the specified directory with the wanted name.
+ * It does NOT read the inode of the
* entry - you'll have to do that yourself if you want to.
+ *
+ * On success, folio_release_kmap() should be called on *foliop.
*/
-minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
+minix_dirent *minix_find_entry(struct dentry *dentry, struct folio **foliop)
{
const char * name = dentry->d_name.name;
int namelen = dentry->d_name.len;
@@ -158,17 +160,15 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
struct minix_sb_info * sbi = minix_sb(sb);
unsigned long n;
unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
char *p;
char *namx;
__u32 inumber;
- *res_page = NULL;
for (n = 0; n < npages; n++) {
char *kaddr, *limit;
- kaddr = dir_get_page(dir, n, &page);
+ kaddr = dir_get_folio(dir, n, foliop);
if (IS_ERR(kaddr))
continue;
@@ -188,12 +188,11 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
if (namecompare(namelen, sbi->s_namelen, name, namx))
goto found;
}
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(*foliop, kaddr);
}
return NULL;
found:
- *res_page = page;
return (minix_dirent *)p;
}
@@ -204,7 +203,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
int namelen = dentry->d_name.len;
struct super_block * sb = dir->i_sb;
struct minix_sb_info * sbi = minix_sb(sb);
- struct page *page = NULL;
+ struct folio *folio = NULL;
unsigned long npages = dir_pages(dir);
unsigned long n;
char *kaddr, *p;
@@ -223,10 +222,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
for (n = 0; n <= npages; n++) {
char *limit, *dir_end;
- kaddr = dir_get_page(dir, n, &page);
+ kaddr = dir_get_folio(dir, n, &folio);
if (IS_ERR(kaddr))
return PTR_ERR(kaddr);
- lock_page(page);
+ folio_lock(folio);
dir_end = kaddr + minix_last_byte(dir, n);
limit = kaddr + PAGE_SIZE - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
@@ -253,15 +252,15 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
if (namecompare(namelen, sbi->s_namelen, name, namx))
goto out_unlock;
}
- unlock_page(page);
- unmap_and_put_page(page, kaddr);
+ folio_unlock(folio);
+ folio_release_kmap(folio, kaddr);
}
BUG();
return -EINVAL;
got_it:
- pos = page_offset(page) + offset_in_page(p);
- err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
+ pos = folio_pos(folio) + offset_in_folio(folio, p);
+ err = minix_prepare_chunk(folio, pos, sbi->s_dirsize);
if (err)
goto out_unlock;
memcpy (namx, name, namelen);
@@ -272,37 +271,37 @@ got_it:
memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2);
de->inode = inode->i_ino;
}
- dir_commit_chunk(page, pos, sbi->s_dirsize);
+ dir_commit_chunk(folio, pos, sbi->s_dirsize);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
return err;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
goto out_put;
}
-int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
+int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- loff_t pos = page_offset(page) + offset_in_page(de);
+ struct inode *inode = folio->mapping->host;
+ loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
unsigned len = sbi->s_dirsize;
int err;
- lock_page(page);
- err = minix_prepare_chunk(page, pos, len);
+ folio_lock(folio);
+ err = minix_prepare_chunk(folio, pos, len);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
if (sbi->s_version == MINIX_V3)
((minix3_dirent *)de)->inode = 0;
else
de->inode = 0;
- dir_commit_chunk(page, pos, len);
+ dir_commit_chunk(folio, pos, len);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return minix_handle_dirsync(inode);
@@ -310,21 +309,21 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
int minix_make_empty(struct inode *inode, struct inode *dir)
{
- struct page *page = grab_cache_page(inode->i_mapping, 0);
+ struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
char *kaddr;
int err;
- if (!page)
- return -ENOMEM;
- err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ err = minix_prepare_chunk(folio, 0, 2 * sbi->s_dirsize);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
goto fail;
}
- kaddr = kmap_local_page(page);
- memset(kaddr, 0, PAGE_SIZE);
+ kaddr = kmap_local_folio(folio, 0);
+ memset(kaddr, 0, folio_size(folio));
if (sbi->s_version == MINIX_V3) {
minix3_dirent *de3 = (minix3_dirent *)kaddr;
@@ -345,10 +344,10 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
}
kunmap_local(kaddr);
- dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
+ dir_commit_chunk(folio, 0, 2 * sbi->s_dirsize);
err = minix_handle_dirsync(inode);
fail:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -357,7 +356,7 @@ fail:
*/
int minix_empty_dir(struct inode * inode)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
unsigned long i, npages = dir_pages(inode);
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
char *name, *kaddr;
@@ -366,7 +365,7 @@ int minix_empty_dir(struct inode * inode)
for (i = 0; i < npages; i++) {
char *p, *limit;
- kaddr = dir_get_page(inode, i, &page);
+ kaddr = dir_get_folio(inode, i, &folio);
if (IS_ERR(kaddr))
continue;
@@ -395,44 +394,44 @@ int minix_empty_dir(struct inode * inode)
goto not_empty;
}
}
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 1;
not_empty:
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
return 0;
}
/* Releases the page */
-int minix_set_link(struct minix_dir_entry *de, struct page *page,
+int minix_set_link(struct minix_dir_entry *de, struct folio *folio,
struct inode *inode)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = folio->mapping->host;
struct minix_sb_info *sbi = minix_sb(dir->i_sb);
- loff_t pos = page_offset(page) + offset_in_page(de);
+ loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
int err;
- lock_page(page);
- err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
+ folio_lock(folio);
+ err = minix_prepare_chunk(folio, pos, sbi->s_dirsize);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
if (sbi->s_version == MINIX_V3)
((minix3_dirent *)de)->inode = inode->i_ino;
else
de->inode = inode->i_ino;
- dir_commit_chunk(page, pos, sbi->s_dirsize);
+ dir_commit_chunk(folio, pos, sbi->s_dirsize);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return minix_handle_dirsync(dir);
}
-struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p)
+struct minix_dir_entry *minix_dotdot(struct inode *dir, struct folio **foliop)
{
struct minix_sb_info *sbi = minix_sb(dir->i_sb);
- struct minix_dir_entry *de = dir_get_page(dir, 0, p);
+ struct minix_dir_entry *de = dir_get_folio(dir, 0, foliop);
if (!IS_ERR(de))
return minix_next_entry(de, sbi);
@@ -441,20 +440,19 @@ struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p)
ino_t minix_inode_by_name(struct dentry *dentry)
{
- struct page *page;
- struct minix_dir_entry *de = minix_find_entry(dentry, &page);
+ struct folio *folio;
+ struct minix_dir_entry *de = minix_find_entry(dentry, &folio);
ino_t res = 0;
if (de) {
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = folio->mapping->host;
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
if (sbi->s_version == MINIX_V3)
res = ((minix3_dirent *) de)->inode;
else
res = de->inode;
- unmap_and_put_page(page, de);
+ folio_release_kmap(folio, de);
}
return res;
}
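
Every minix directory helper above now follows the same mapping discipline: dir_get_folio() hands back a kmapped address while holding a folio reference, and folio_release_kmap() undoes both (kunmap_local() plus folio_put()), replacing the old unmap_and_put_page() pairing. A minimal sketch of that pairing; example_scan_dir() is a hypothetical caller and the entry walk is elided:

	/* Illustrative only: map, inspect, then release one directory folio. */
	static int example_scan_dir(struct inode *dir, unsigned long n)
	{
		struct folio *folio;
		char *kaddr = dir_get_folio(dir, n, &folio);	/* kmap_local + folio ref */

		if (IS_ERR(kaddr))
			return PTR_ERR(kaddr);
		/* ... walk minix_dir_entry records in kaddr ... */
		folio_release_kmap(folio, kaddr);		/* kunmap_local + folio_put */
		return 0;
	}
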
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 1c3df63162ef..f007e389d5d2 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -427,9 +427,9 @@ static int minix_read_folio(struct file *file, struct folio *folio)
return block_read_full_folio(folio, minix_get_block);
}
-int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- return __block_write_begin(page, pos, len, minix_get_block);
+ return __block_write_begin(folio, pos, len, minix_get_block);
}
static void minix_write_failed(struct address_space *mapping, loff_t to)
@@ -444,11 +444,11 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
static int minix_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, pagep, minix_get_block);
+ ret = block_write_begin(mapping, pos, len, foliop, minix_get_block);
if (unlikely(ret))
minix_write_failed(mapping, pos + len);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index d493507c064f..d54273c3c9ff 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -42,18 +42,18 @@ struct minix_sb_info {
unsigned short s_version;
};
-extern struct inode *minix_iget(struct super_block *, unsigned long);
-extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
-extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
-extern struct inode * minix_new_inode(const struct inode *, umode_t);
-extern void minix_free_inode(struct inode * inode);
-extern unsigned long minix_count_free_inodes(struct super_block *sb);
-extern int minix_new_block(struct inode * inode);
-extern void minix_free_block(struct inode *inode, unsigned long block);
-extern unsigned long minix_count_free_blocks(struct super_block *sb);
-extern int minix_getattr(struct mnt_idmap *, const struct path *,
- struct kstat *, u32, unsigned int);
-extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
+struct inode *minix_iget(struct super_block *, unsigned long);
+struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
+struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
+struct inode *minix_new_inode(const struct inode *, umode_t);
+void minix_free_inode(struct inode *inode);
+unsigned long minix_count_free_inodes(struct super_block *sb);
+int minix_new_block(struct inode *inode);
+void minix_free_block(struct inode *inode, unsigned long block);
+unsigned long minix_count_free_blocks(struct super_block *sb);
+int minix_getattr(struct mnt_idmap *, const struct path *,
+ struct kstat *, u32, unsigned int);
+int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len);
extern void V1_minix_truncate(struct inode *);
extern void V2_minix_truncate(struct inode *);
@@ -64,15 +64,15 @@ extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
extern unsigned V1_minix_blocks(loff_t, struct super_block *);
extern unsigned V2_minix_blocks(loff_t, struct super_block *);
-extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**);
-extern int minix_add_link(struct dentry*, struct inode*);
-extern int minix_delete_entry(struct minix_dir_entry*, struct page*);
-extern int minix_make_empty(struct inode*, struct inode*);
-extern int minix_empty_dir(struct inode*);
-int minix_set_link(struct minix_dir_entry *de, struct page *page,
+struct minix_dir_entry *minix_find_entry(struct dentry *, struct folio **);
+int minix_add_link(struct dentry*, struct inode*);
+int minix_delete_entry(struct minix_dir_entry *, struct folio *);
+int minix_make_empty(struct inode*, struct inode*);
+int minix_empty_dir(struct inode*);
+int minix_set_link(struct minix_dir_entry *de, struct folio *folio,
struct inode *inode);
-extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
-extern ino_t minix_inode_by_name(struct dentry*);
+struct minix_dir_entry *minix_dotdot(struct inode*, struct folio **);
+ino_t minix_inode_by_name(struct dentry*);
extern const struct inode_operations minix_file_inode_operations;
extern const struct inode_operations minix_dir_inode_operations;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index a944a0f17b53..5d9c1406fe27 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -141,15 +141,15 @@ out_fail:
static int minix_unlink(struct inode * dir, struct dentry *dentry)
{
struct inode * inode = d_inode(dentry);
- struct page * page;
+ struct folio *folio;
struct minix_dir_entry * de;
int err;
- de = minix_find_entry(dentry, &page);
+ de = minix_find_entry(dentry, &folio);
if (!de)
return -ENOENT;
- err = minix_delete_entry(de, page);
- unmap_and_put_page(page, de);
+ err = minix_delete_entry(de, folio);
+ folio_release_kmap(folio, de);
if (err)
return err;
@@ -180,28 +180,28 @@ static int minix_rename(struct mnt_idmap *idmap,
{
struct inode * old_inode = d_inode(old_dentry);
struct inode * new_inode = d_inode(new_dentry);
- struct page * dir_page = NULL;
+ struct folio * dir_folio = NULL;
struct minix_dir_entry * dir_de = NULL;
- struct page * old_page;
+ struct folio *old_folio;
struct minix_dir_entry * old_de;
int err = -ENOENT;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
- old_de = minix_find_entry(old_dentry, &old_page);
+ old_de = minix_find_entry(old_dentry, &old_folio);
if (!old_de)
goto out;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
- dir_de = minix_dotdot(old_inode, &dir_page);
+ dir_de = minix_dotdot(old_inode, &dir_folio);
if (!dir_de)
goto out_old;
}
if (new_inode) {
- struct page * new_page;
+ struct folio *new_folio;
struct minix_dir_entry * new_de;
err = -ENOTEMPTY;
@@ -209,11 +209,11 @@ static int minix_rename(struct mnt_idmap *idmap,
goto out_dir;
err = -ENOENT;
- new_de = minix_find_entry(new_dentry, &new_page);
+ new_de = minix_find_entry(new_dentry, &new_folio);
if (!new_de)
goto out_dir;
- err = minix_set_link(new_de, new_page, old_inode);
- unmap_and_put_page(new_page, new_de);
+ err = minix_set_link(new_de, new_folio, old_inode);
+ folio_release_kmap(new_folio, new_de);
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
@@ -228,22 +228,22 @@ static int minix_rename(struct mnt_idmap *idmap,
inode_inc_link_count(new_dir);
}
- err = minix_delete_entry(old_de, old_page);
+ err = minix_delete_entry(old_de, old_folio);
if (err)
goto out_dir;
mark_inode_dirty(old_inode);
if (dir_de) {
- err = minix_set_link(dir_de, dir_page, new_dir);
+ err = minix_set_link(dir_de, dir_folio, new_dir);
if (!err)
inode_dec_link_count(old_dir);
}
out_dir:
if (dir_de)
- unmap_and_put_page(dir_page, dir_de);
+ folio_release_kmap(dir_folio, dir_de);
out_old:
- unmap_and_put_page(old_page, old_de);
+ folio_release_kmap(old_folio, old_de);
out:
return err;
}
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 3c60f1eaca61..79491663dbc0 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -228,15 +228,15 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from,
return 0;
}
- forward = kmemdup(map_from->forward,
- nr_extents * sizeof(struct uid_gid_extent),
- GFP_KERNEL_ACCOUNT);
+ forward = kmemdup_array(map_from->forward, nr_extents,
+ sizeof(struct uid_gid_extent),
+ GFP_KERNEL_ACCOUNT);
if (!forward)
return -ENOMEM;
- reverse = kmemdup(map_from->reverse,
- nr_extents * sizeof(struct uid_gid_extent),
- GFP_KERNEL_ACCOUNT);
+ reverse = kmemdup_array(map_from->reverse, nr_extents,
+ sizeof(struct uid_gid_extent),
+ GFP_KERNEL_ACCOUNT);
if (!reverse) {
kfree(forward);
return -ENOMEM;
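
The kmemdup_array() conversion moves the count-times-size multiplication into the helper, where it is overflow-checked, instead of open-coding it at the call site. A sketch of the pattern, mirroring the call above:

	/* Sketch only: duplicate an extent table with checked sizing. */
	static struct uid_gid_extent *dup_extents(const struct uid_gid_extent *src,
						  u32 nr)
	{
		/* Equivalent to kmemdup(src, nr * sizeof(*src), gfp), but the
		 * multiplication cannot silently overflow. */
		return kmemdup_array(src, nr, sizeof(*src), GFP_KERNEL_ACCOUNT);
	}
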
diff --git a/fs/mount.h b/fs/mount.h
index ad4b1ddebb54..185fc56afc13 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -153,5 +153,17 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
list_add_tail(&mnt->mnt_list, dt_list);
}
-extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
+struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
+static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
+{
+ return __lookup_next_mnt_ns(mntns, false);
+}
+static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
+{
+ return __lookup_next_mnt_ns(mntns, true);
+}
+static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct mnt_namespace, ns);
+}
diff --git a/fs/namei.c b/fs/namei.c
index 5512cb10fa89..891b169e38c9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1639,6 +1639,20 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
+/**
+ * lookup_fast - do fast lockless (but racy) lookup of a dentry
+ * @nd: current nameidata
+ *
+ * Do a fast, but racy lookup in the dcache for the given dentry, and
+ * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
+ * found. On error, an ERR_PTR will be returned.
+ *
+ * If this function returns a valid dentry and the walk is no longer
+ * lazy, the dentry will carry a reference that must later be put. If
+ * RCU mode is still in force, then this is not the case and the dentry
+ * must be legitimized before use. If this returns NULL, then the walk
+ * will no longer be in RCU mode.
+ */
static struct dentry *lookup_fast(struct nameidata *nd)
{
struct dentry *dentry, *parent = nd->path.dentry;
@@ -3521,6 +3535,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
return dentry;
}
+ if (open_flag & O_CREAT)
+ audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
+
/*
	 * Checking write permission is tricky, because we don't know if we are
* going to actually need it: O_CREAT opens should work as long as the
@@ -3591,6 +3608,42 @@ out_dput:
return ERR_PTR(error);
}
+static inline bool trailing_slashes(struct nameidata *nd)
+{
+ return (bool)nd->last.name[nd->last.len];
+}
+
+static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
+{
+ struct dentry *dentry;
+
+ if (open_flag & O_CREAT) {
+ if (trailing_slashes(nd))
+ return ERR_PTR(-EISDIR);
+
+ /* Don't bother on an O_EXCL create */
+ if (open_flag & O_EXCL)
+ return NULL;
+ }
+
+ if (trailing_slashes(nd))
+ nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+ dentry = lookup_fast(nd);
+ if (IS_ERR_OR_NULL(dentry))
+ return dentry;
+
+ if (open_flag & O_CREAT) {
+ /* Discard negative dentries. Need inode_lock to do the create */
+ if (!dentry->d_inode) {
+ if (!(nd->flags & LOOKUP_RCU))
+ dput(dentry);
+ dentry = NULL;
+ }
+ }
+ return dentry;
+}
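
For illustration, the fast-path rules above map onto open(2) calls as follows (paths invented):

	/* Userspace view of lookup_fast_for_open(), illustrative only. */
	#include <fcntl.h>

	void example(void)
	{
		/* Trailing slash on an O_CREAT open fails early: -EISDIR. */
		open("dir/", O_CREAT | O_WRONLY, 0644);
		/* O_CREAT|O_EXCL skips the fast path; creation takes the slow route. */
		open("newfile", O_CREAT | O_EXCL | O_WRONLY, 0644);
		/* A plain open of an existing name may be served from the dcache. */
		open("existing", O_RDONLY);
	}
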
+
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
@@ -3608,28 +3661,22 @@ static const char *open_last_lookups(struct nameidata *nd,
return handle_dots(nd, nd->last_type);
}
- if (!(open_flag & O_CREAT)) {
- if (nd->last.name[nd->last.len])
- nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
- /* we _can_ be in RCU mode here */
- dentry = lookup_fast(nd);
- if (IS_ERR(dentry))
- return ERR_CAST(dentry);
- if (likely(dentry))
- goto finish_lookup;
+ /* We _can_ be in RCU mode here */
+ dentry = lookup_fast_for_open(nd, open_flag);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
+ if (likely(dentry))
+ goto finish_lookup;
+
+ if (!(open_flag & O_CREAT)) {
if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
return ERR_PTR(-ECHILD);
} else {
- /* create side of things */
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
- audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
- /* trailing slashes? */
- if (unlikely(nd->last.name[nd->last.len]))
- return ERR_PTR(-EISDIR);
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
@@ -5304,7 +5351,7 @@ int page_symlink(struct inode *inode, const char *symname, int len)
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int err;
unsigned int flags;
@@ -5312,16 +5359,16 @@ int page_symlink(struct inode *inode, const char *symname, int len)
retry:
if (nofs)
flags = memalloc_nofs_save();
- err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
+ err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata);
if (nofs)
memalloc_nofs_restore(flags);
if (err)
goto fail;
- memcpy(page_address(page), symname, len-1);
+ memcpy(folio_address(folio), symname, len - 1);
- err = aops->write_end(NULL, mapping, 0, len-1, len-1,
- page, fsdata);
+ err = aops->write_end(NULL, mapping, 0, len - 1, len - 1,
+ folio, fsdata);
if (err < 0)
goto fail;
if (err < len-1)
diff --git a/fs/namespace.c b/fs/namespace.c
index 328087a4df8a..e71e4564987b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1774,7 +1774,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
list_del_init(&p->mnt_child);
}
- /* Add propogated mounts to the tmp_list */
+ /* Add propagated mounts to the tmp_list */
if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
@@ -2060,14 +2060,41 @@ static bool is_mnt_ns_file(struct dentry *dentry)
dentry->d_fsdata == &mntns_operations;
}
-static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
- return container_of(ns, struct mnt_namespace, ns);
+ return &mnt->ns;
}
-struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
+struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
- return &mnt->ns;
+ guard(read_lock)(&mnt_ns_tree_lock);
+ for (;;) {
+ struct rb_node *node;
+
+ if (previous)
+ node = rb_prev(&mntns->mnt_ns_tree_node);
+ else
+ node = rb_next(&mntns->mnt_ns_tree_node);
+ if (!node)
+ return ERR_PTR(-ENOENT);
+
+ mntns = node_to_mnt_ns(node);
+ node = &mntns->mnt_ns_tree_node;
+
+ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
+ continue;
+
+ /*
+ * Holding mnt_ns_tree_lock prevents the mount namespace from
+	 * being freed but it may well be on its deathbed. We want an
+ * active reference, not just a passive one here as we're
+ * persisting the mount namespace.
+ */
+ if (!refcount_inc_not_zero(&mntns->ns.count))
+ continue;
+
+ return mntns;
+ }
}
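
__lookup_next_mnt_ns() walks the namespace rbtree in mount-namespace-ID order, skips namespaces the caller cannot administer, and returns the next one with an active reference held. A rough sketch of a forward walk built on the mount.h wrappers; the use of put_mnt_ns() to drop that reference is an assumption of this sketch:

	/* Sketch only: visit each mount namespace visible to the caller. */
	static void example_walk_mnt_ns(struct mnt_namespace *start)
	{
		struct mnt_namespace *cur = start, *next;

		for (;;) {
			next = lookup_next_mnt_ns(cur);	/* active ref or ERR_PTR */
			if (IS_ERR(next))
				break;			/* -ENOENT: nothing further */
			/* ... inspect next ... */
			if (cur != start)
				put_mnt_ns(cur);	/* assumed release */
			cur = next;
		}
		if (cur != start)
			put_mnt_ns(cur);
	}
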
static bool mnt_ns_loop(struct dentry *dentry)
@@ -2921,8 +2948,15 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
if (!__mnt_is_readonly(mnt) &&
(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
- char *buf = (char *)__get_free_page(GFP_KERNEL);
- char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
+ char *buf, *mntpath;
+
+ buf = (char *)__get_free_page(GFP_KERNEL);
+ if (buf)
+ mntpath = d_path(mountpoint, buf, PAGE_SIZE);
+ else
+ mntpath = ERR_PTR(-ENOMEM);
+ if (IS_ERR(mntpath))
+ mntpath = "(unknown)";
pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
sb->s_type->name,
@@ -2930,8 +2964,9 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
mntpath, &sb->s_time_max,
(unsigned long long)sb->s_time_max);
- free_page((unsigned long)buf);
sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
+ if (buf)
+ free_page((unsigned long)buf);
}
}
@@ -5243,12 +5278,37 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
* that, or if not simply grab a passive reference on our mount namespace and
* return that.
*/
-static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
{
- if (mnt_ns_id)
- return lookup_mnt_ns(mnt_ns_id);
- refcount_inc(&current->nsproxy->mnt_ns->passive);
- return current->nsproxy->mnt_ns;
+ struct mnt_namespace *mnt_ns;
+
+ if (kreq->mnt_ns_id && kreq->spare)
+ return ERR_PTR(-EINVAL);
+
+ if (kreq->mnt_ns_id)
+ return lookup_mnt_ns(kreq->mnt_ns_id);
+
+ if (kreq->spare) {
+ struct ns_common *ns;
+
+ CLASS(fd, f)(kreq->spare);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (!proc_ns_file(f.file))
+ return ERR_PTR(-EINVAL);
+
+ ns = get_proc_ns(file_inode(f.file));
+ if (ns->ops->type != CLONE_NEWNS)
+ return ERR_PTR(-EINVAL);
+
+ mnt_ns = to_mnt_ns(ns);
+ } else {
+ mnt_ns = current->nsproxy->mnt_ns;
+ }
+
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
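
The resolution rules the syscalls get from grab_requested_mnt_ns() can be summarised as follows (illustrative restatement of the code above):

	/*
	 * kreq.mnt_ns_id set and kreq.spare set -> -EINVAL (ambiguous request)
	 * kreq.mnt_ns_id set                    -> look the namespace up by ID
	 * kreq.spare set                        -> treat it as an nsfd; it must
	 *                                          refer to a CLONE_NEWNS proc file
	 * neither set                           -> the caller's own mount namespace
	 *
	 * In every successful case the caller ends up with a passive reference
	 * (lookup_mnt_ns() is assumed to take one for the ID path).
	 */
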
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
@@ -5269,7 +5329,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
@@ -5396,7 +5456,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (!kmnt_ids)
return -ENOMEM;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
@@ -5605,7 +5665,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
/* Only worry about locked mounts */
if (!(child->mnt.mnt_flags & MNT_LOCKED))
continue;
- /* Is the directory permanetly empty? */
+ /* Is the directory permanently empty? */
if (!is_empty_dir_inode(inode))
goto next;
}
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index 1b78e8b65ebc..7701c037c328 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -24,7 +24,7 @@ config NETFS_STATS
config NETFS_DEBUG
bool "Enable dynamic debugging netfslib and FS-Cache"
- depends on NETFS
+ depends on NETFS_SUPPORT
help
This permits debugging to be dynamically enabled in the local caching
management module. If this is set, the debugging output may be
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 8e6781e0b10b..d08b0bfb6756 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -5,12 +5,14 @@ netfs-y := \
buffered_write.o \
direct_read.o \
direct_write.o \
- io.o \
iterator.o \
locking.o \
main.o \
misc.o \
objects.o \
+ read_collect.o \
+ read_pgpriv2.o \
+ read_retry.o \
write_collect.o \
write_issue.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index a688d4c75d99..c40e226053cc 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -9,126 +9,6 @@
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
-/*
- * Unlock the folios in a read operation. We need to set PG_writeback on any
- * folios we're going to write back before we unlock them.
- *
- * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
- * PG_private_2 and do a direct write to the cache from here instead.
- */
-void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_folio *finfo;
- struct folio *folio;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- size_t account = 0;
- bool subreq_failed = false;
-
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
- if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
- __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
- }
- }
-
- /* Walk through the pagecache and the I/O request lists simultaneously.
- * We may have a mixture of cached and uncached sections and we only
- * really want to write out the uncached sections. This is slightly
- * complicated by the possibility that we might have huge pages with a
- * mixture inside.
- */
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- subreq_failed = (subreq->error < 0);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
-
- rcu_read_lock();
- xas_for_each(&xas, folio, last_page) {
- loff_t pg_end;
- bool pg_failed = false;
- bool wback_to_cache = false;
- bool folio_started = false;
-
- if (xas_retry(&xas, folio))
- continue;
-
- pg_end = folio_pos(folio) + folio_size(folio) - 1;
-
- for (;;) {
- loff_t sreq_end;
-
- if (!subreq) {
- pg_failed = true;
- break;
- }
- if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
- if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
- &subreq->flags)) {
- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
- folio_start_private_2(folio);
- folio_started = true;
- }
- } else {
- wback_to_cache |=
- test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
- }
- pg_failed |= subreq_failed;
- sreq_end = subreq->start + subreq->len - 1;
- if (pg_end < sreq_end)
- break;
-
- account += subreq->transferred;
- if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
-
- if (pg_end == sreq_end)
- break;
- }
-
- if (!pg_failed) {
- flush_dcache_folio(folio);
- finfo = netfs_folio_info(folio);
- if (finfo) {
- trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
- if (finfo->netfs_group)
- folio_change_private(folio, finfo->netfs_group);
- else
- folio_detach_private(folio);
- kfree(finfo);
- }
- folio_mark_uptodate(folio);
- if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
- trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
- folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
- filemap_dirty_folio(folio->mapping, folio);
- }
- }
-
- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio->index == rreq->no_unlock_folio &&
- test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
- _debug("no unlock");
- else
- folio_unlock(folio);
- }
- }
- rcu_read_unlock();
-
- task_io_account_read(account);
- if (rreq->netfs_ops->done)
- rreq->netfs_ops->done(rreq);
-}
-
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
unsigned long long *_start,
unsigned long long *_len,
@@ -183,6 +63,336 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}
+/*
+ * Decant the list of folios to read into a rolling buffer.
+ */
+static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
+ struct folio_queue *folioq)
+{
+ unsigned int order, nr;
+ size_t size = 0;
+
+ nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios,
+ ARRAY_SIZE(folioq->vec.folios));
+ folioq->vec.nr = nr;
+ for (int i = 0; i < nr; i++) {
+ struct folio *folio = folioq_folio(folioq, i);
+
+ trace_netfs_folio(folio, netfs_folio_trace_read);
+ order = folio_order(folio);
+ folioq->orders[i] = order;
+ size += PAGE_SIZE << order;
+ }
+
+ for (int i = nr; i < folioq_nr_slots(folioq); i++)
+ folioq_clear(folioq, i);
+
+ return size;
+}
+
+/*
+ * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O
+ * @subreq: The subrequest to be set up
+ *
+ * Prepare the I/O iterator representing the read buffer on a subrequest for
+ * the filesystem to use for I/O (it can be passed directly to a socket). This
+ * is intended to be called from the ->issue_read() method once the filesystem
+ * has trimmed the request to the size it wants.
+ *
+ * Returns the limited size if successful and -ENOMEM if insufficient memory is
+ * available.
+ *
+ * [!] NOTE: This must be run in the same thread as ->issue_read() was called
+ * in as we access the readahead_control struct.
+ */
+static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ size_t rsize = subreq->len;
+
+ if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)
+ rsize = umin(rsize, rreq->io_streams[0].sreq_max_len);
+
+ if (rreq->ractl) {
+ /* If we don't have sufficient folios in the rolling buffer,
+ * extract a folioq's worth from the readahead region at a time
+ * into the buffer. Note that this acquires a ref on each page
+ * that we will need to release later - but we don't want to do
+ * that until after we've started the I/O.
+ */
+ while (rreq->submitted < subreq->start + rsize) {
+ struct folio_queue *tail = rreq->buffer_tail, *new;
+ size_t added;
+
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
+ netfs_stat(&netfs_n_folioq);
+ folioq_init(new);
+ new->prev = tail;
+ tail->next = new;
+ rreq->buffer_tail = new;
+ added = netfs_load_buffer_from_ra(rreq, new);
+ rreq->iter.count += added;
+ rreq->submitted += added;
+ }
+ }
+
+ subreq->len = rsize;
+ if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
+ size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize,
+ rreq->io_streams[0].sreq_max_segs);
+
+ if (limit < rsize) {
+ subreq->len = limit;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
+
+ subreq->io_iter = rreq->iter;
+
+ if (iov_iter_is_folioq(&subreq->io_iter)) {
+ if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) {
+ subreq->io_iter.folioq = subreq->io_iter.folioq->next;
+ subreq->io_iter.folioq_slot = 0;
+ }
+ subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq;
+ subreq->curr_folioq_slot = subreq->io_iter.folioq_slot;
+ subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
+ }
+
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(&rreq->iter, subreq->len);
+ return subreq->len;
+}
+
+static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq,
+ loff_t i_size)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ if (!cres->ops)
+ return NETFS_DOWNLOAD_FROM_SERVER;
+ return cres->ops->prepare_read(subreq, i_size);
+}
+
+static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+
+ if (transferred_or_error < 0) {
+ netfs_read_subreq_terminated(subreq, transferred_or_error, was_async);
+ return;
+ }
+
+ if (transferred_or_error > 0)
+ subreq->transferred += transferred_or_error;
+ netfs_read_subreq_terminated(subreq, 0, was_async);
+}
+
+/*
+ * Issue a read against the cache.
+ * - Eats the caller's ref on subreq.
+ */
+static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ netfs_stat(&netfs_n_rh_read);
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE,
+ netfs_cache_read_terminated, subreq);
+}
+
+/*
+ * Perform a read to the pagecache from a series of sources of different types,
+ * slicing up the region to be read according to available cache blocks and
+ * network rsize.
+ */
+static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+{
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
+ unsigned long long start = rreq->start;
+ ssize_t size = rreq->len;
+ int ret = 0;
+
+ atomic_inc(&rreq->nr_outstanding);
+
+ do {
+ struct netfs_io_subrequest *subreq;
+ enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
+ ssize_t slice;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ subreq->start = start;
+ subreq->len = size;
+
+ atomic_inc(&rreq->nr_outstanding);
+ spin_lock_bh(&rreq->lock);
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+ subreq->prev_donated = rreq->prev_donated;
+ rreq->prev_donated = 0;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_added);
+ spin_unlock_bh(&rreq->lock);
+
+ source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size);
+ subreq->source = source;
+ if (source == NETFS_DOWNLOAD_FROM_SERVER) {
+ unsigned long long zp = umin(ictx->zero_point, rreq->i_size);
+ size_t len = subreq->len;
+
+ if (subreq->start >= zp) {
+ subreq->source = source = NETFS_FILL_WITH_ZEROES;
+ goto fill_with_zeroes;
+ }
+
+ if (len > zp - subreq->start)
+ len = zp - subreq->start;
+ if (len == 0) {
+ pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx",
+ rreq->debug_id, subreq->debug_index,
+ subreq->len, size,
+ subreq->start, ictx->zero_point, rreq->i_size);
+ break;
+ }
+ subreq->len = len;
+
+ netfs_stat(&netfs_n_rh_download);
+ if (rreq->netfs_ops->prepare_read) {
+ ret = rreq->netfs_ops->prepare_read(subreq);
+ if (ret < 0) {
+ atomic_dec(&rreq->nr_outstanding);
+ netfs_put_subrequest(subreq, false,
+ netfs_sreq_trace_put_cancel);
+ break;
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+
+ slice = netfs_prepare_read_iterator(subreq);
+ if (slice < 0) {
+ atomic_dec(&rreq->nr_outstanding);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ ret = slice;
+ break;
+ }
+
+ rreq->netfs_ops->issue_read(subreq);
+ goto done;
+ }
+
+ fill_with_zeroes:
+ if (source == NETFS_FILL_WITH_ZEROES) {
+ subreq->source = NETFS_FILL_WITH_ZEROES;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ netfs_stat(&netfs_n_rh_zero);
+ slice = netfs_prepare_read_iterator(subreq);
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_read_subreq_terminated(subreq, 0, false);
+ goto done;
+ }
+
+ if (source == NETFS_READ_FROM_CACHE) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ slice = netfs_prepare_read_iterator(subreq);
+ netfs_read_cache_to_pagecache(rreq, subreq);
+ goto done;
+ }
+
+ pr_err("Unexpected read source %u\n", source);
+ WARN_ON_ONCE(1);
+ break;
+
+ done:
+ size -= slice;
+ start += slice;
+ cond_resched();
+ } while (size > 0);
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ netfs_rreq_terminated(rreq, false);
+
+ /* Defer error return as we may need to wait for outstanding I/O. */
+ cmpxchg(&rreq->error, 0, ret);
+}
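
Under this scheme the filesystem's ->issue_read() is handed a subrequest whose io_iter already describes the slice of pagecache to fill; it performs the transfer and reports the result through netfs_read_subreq_terminated(). A hypothetical, minimal synchronous implementation follows; myfs_fetch() and its behaviour are assumptions, not part of this series:

	/* Hypothetical hook: read subreq->len bytes at subreq->start into
	 * subreq->io_iter, then signal completion. Async and retry handling
	 * are deliberately elided. */
	static void myfs_issue_read(struct netfs_io_subrequest *subreq)
	{
		ssize_t n = myfs_fetch(subreq->rreq->inode, subreq->start,
				       subreq->len, &subreq->io_iter); /* assumed helper */

		if (n < 0) {
			netfs_read_subreq_terminated(subreq, n, false);
			return;
		}
		subreq->transferred = n;
		netfs_read_subreq_terminated(subreq, 0, false);
	}
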
+
+/*
+ * Wait for the read operation to complete, successfully or otherwise.
+ */
+static int netfs_wait_for_read(struct netfs_io_request *rreq)
+{
+ int ret;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
+ ret = rreq->error;
+ if (ret == 0 && rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+ ret = -EIO;
+ }
+
+ return ret;
+}
+
+/*
+ * Set up the initial folioq of buffer folios in the rolling buffer and set the
+ * iterator to refer to it.
+ */
+static int netfs_prime_buffer(struct netfs_io_request *rreq)
+{
+ struct folio_queue *folioq;
+ size_t added;
+
+ folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
+ if (!folioq)
+ return -ENOMEM;
+ netfs_stat(&netfs_n_folioq);
+ folioq_init(folioq);
+ rreq->buffer = folioq;
+ rreq->buffer_tail = folioq;
+ rreq->submitted = rreq->start;
+ iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0);
+
+ added = netfs_load_buffer_from_ra(rreq, folioq);
+ rreq->iter.count += added;
+ rreq->submitted += added;
+ return 0;
+}
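
After priming, the read buffer is a chain of folio_queue segments that grows at the tail as more of the readahead window is decanted; rreq->submitted tracks the file position up to which the buffer has been populated. Roughly (illustrative):

	/*
	 *   rreq->buffer -> [folioq 0: readahead folios] -> [folioq 1: ...] -> NULL
	 *                                                    ^ rreq->buffer_tail
	 *
	 *   rreq->submitted = rreq->start + bytes decanted so far
	 *   rreq->iter      = ITER_DEST iterator spanning everything decanted
	 */
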
+
+/*
+ * Drop the ref on each folio that we inherited from the VM readahead code. We
+ * still have the folio locks to pin the page until we complete the I/O.
+ *
+ * Note that we can't just release the batch in each queue struct as we use the
+ * occupancy count in other places.
+ */
+static void netfs_put_ra_refs(struct folio_queue *folioq)
+{
+ struct folio_batch fbatch;
+
+ folio_batch_init(&fbatch);
+ while (folioq) {
+ for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) {
+ struct folio *folio = folioq_folio(folioq, slot);
+ if (!folio)
+ continue;
+ trace_netfs_folio(folio, netfs_folio_trace_read_put);
+ if (!folio_batch_add(&fbatch, folio))
+ folio_batch_release(&fbatch);
+ }
+ folioq = folioq->next;
+ }
+
+ folio_batch_release(&fbatch);
+}
+
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -201,22 +411,17 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
void netfs_readahead(struct readahead_control *ractl)
{
struct netfs_io_request *rreq;
- struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
+ struct netfs_inode *ictx = netfs_inode(ractl->mapping->host);
+ unsigned long long start = readahead_pos(ractl);
+ size_t size = readahead_length(ractl);
int ret;
- _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
-
- if (readahead_count(ractl) == 0)
- return;
-
- rreq = netfs_alloc_request(ractl->mapping, ractl->file,
- readahead_pos(ractl),
- readahead_length(ractl),
+ rreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size,
NETFS_READAHEAD);
if (IS_ERR(rreq))
return;
- ret = netfs_begin_cache_read(rreq, ctx);
+ ret = netfs_begin_cache_read(rreq, ictx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto cleanup_free;
@@ -226,18 +431,15 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
- /* Set up the output buffer */
- iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
- rreq->start, rreq->len);
+ rreq->ractl = ractl;
+ if (netfs_prime_buffer(rreq) < 0)
+ goto cleanup_free;
+ netfs_read_to_pagecache(rreq);
- /* Drop the refs on the folios here rather than in the cache or
- * filesystem. The locks will be dropped in netfs_rreq_unlock().
- */
- while (readahead_folio(ractl))
- ;
+ /* Release the folio refs whilst we're waiting for the I/O. */
+ netfs_put_ra_refs(rreq->buffer);
- netfs_begin_read(rreq, false);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
return;
cleanup_free:
@@ -246,6 +448,117 @@ cleanup_free:
}
EXPORT_SYMBOL(netfs_readahead);
+/*
+ * Create a rolling buffer with a single occupying folio.
+ */
+static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio)
+{
+ struct folio_queue *folioq;
+
+ folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
+ if (!folioq)
+ return -ENOMEM;
+
+ netfs_stat(&netfs_n_folioq);
+ folioq_init(folioq);
+ folioq_append(folioq, folio);
+ BUG_ON(folioq_folio(folioq, 0) != folio);
+ BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio));
+ rreq->buffer = folioq;
+ rreq->buffer_tail = folioq;
+ rreq->submitted = rreq->start + rreq->len;
+ iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len);
+ rreq->ractl = (struct readahead_control *)1UL;
+ return 0;
+}
+
+/*
+ * Read into gaps in a folio partially filled by a streaming write.
+ */
+static int netfs_read_gaps(struct file *file, struct folio *folio)
+{
+ struct netfs_io_request *rreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ struct folio *sink = NULL;
+ struct bio_vec *bvec;
+ unsigned int from = finfo->dirty_offset;
+ unsigned int to = from + finfo->dirty_len;
+ unsigned int off = 0, i = 0;
+ size_t flen = folio_size(folio);
+ size_t nr_bvec = flen / PAGE_SIZE + 2;
+ size_t part;
+ int ret;
+
+ _enter("%lx", folio->index);
+
+ rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto alloc_error;
+ }
+
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
+
+ netfs_stat(&netfs_n_rh_read_folio);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps);
+
+ /* Fiddle the buffer so that a gap at the beginning and/or a gap at the
+ * end get copied to, but the middle is discarded.
+ */
+ ret = -ENOMEM;
+ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ goto discard;
+
+ sink = folio_alloc(GFP_KERNEL, 0);
+ if (!sink) {
+ kfree(bvec);
+ goto discard;
+ }
+
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+
+ rreq->direct_bv = bvec;
+ rreq->direct_bv_count = nr_bvec;
+ if (from > 0) {
+ bvec_set_folio(&bvec[i++], folio, from, 0);
+ off = from;
+ }
+ while (off < to) {
+ part = min_t(size_t, to - off, PAGE_SIZE);
+ bvec_set_folio(&bvec[i++], sink, part, 0);
+ off += part;
+ }
+ if (to < flen)
+ bvec_set_folio(&bvec[i++], folio, flen - to, to);
+ iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
+ rreq->submitted = rreq->start + flen;
+
+ netfs_read_to_pagecache(rreq);
+
+ if (sink)
+ folio_put(sink);
+
+ ret = netfs_wait_for_read(rreq);
+ if (ret == 0) {
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ }
+ folio_unlock(folio);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret < 0 ? ret : 0;
+
+discard:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+alloc_error:
+ folio_unlock(folio);
+ return ret;
+}
+
/**
* netfs_read_folio - Helper to manage a read_folio request
* @file: The file to read from
@@ -265,9 +578,13 @@ int netfs_read_folio(struct file *file, struct folio *folio)
struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
- struct folio *sink = NULL;
int ret;
+ if (folio_test_dirty(folio)) {
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+ return netfs_read_gaps(file, folio);
+ }
+
_enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
@@ -286,54 +603,12 @@ int netfs_read_folio(struct file *file, struct folio *folio)
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */
- if (folio_test_dirty(folio)) {
- /* Handle someone trying to read from an unflushed streaming
- * write. We fiddle the buffer so that a gap at the beginning
- * and/or a gap at the end get copied to, but the middle is
- * discarded.
- */
- struct netfs_folio *finfo = netfs_folio_info(folio);
- struct bio_vec *bvec;
- unsigned int from = finfo->dirty_offset;
- unsigned int to = from + finfo->dirty_len;
- unsigned int off = 0, i = 0;
- size_t flen = folio_size(folio);
- size_t nr_bvec = flen / PAGE_SIZE + 2;
- size_t part;
-
- ret = -ENOMEM;
- bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
- if (!bvec)
- goto discard;
-
- sink = folio_alloc(GFP_KERNEL, 0);
- if (!sink)
- goto discard;
-
- trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
-
- rreq->direct_bv = bvec;
- rreq->direct_bv_count = nr_bvec;
- if (from > 0) {
- bvec_set_folio(&bvec[i++], folio, from, 0);
- off = from;
- }
- while (off < to) {
- part = min_t(size_t, to - off, PAGE_SIZE);
- bvec_set_folio(&bvec[i++], sink, part, 0);
- off += part;
- }
- if (to < flen)
- bvec_set_folio(&bvec[i++], folio, flen - to, to);
- iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
- } else {
- iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
- rreq->start, rreq->len);
- }
+ ret = netfs_create_singular_buffer(rreq, folio);
+ if (ret < 0)
+ goto discard;
- ret = netfs_begin_read(rreq, true);
- if (sink)
- folio_put(sink);
+ netfs_read_to_pagecache(rreq);
+ ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
@@ -395,7 +670,7 @@ zero_out:
}
/**
- * netfs_write_begin - Helper to prepare for writing
+ * netfs_write_begin - Helper to prepare for writing [DEPRECATED]
* @ctx: The netfs context
* @file: The file to read from
* @mapping: The mapping to read from
@@ -406,13 +681,10 @@ zero_out:
*
* Pre-read data for a write-begin request by drawing data from the cache if
* possible, or the netfs if not. Space beyond the EOF is zero-filled.
- * Multiple I/O requests from different sources will get munged together. If
- * necessary, the readahead window can be expanded in either direction to a
- * more convenient alighment for RPC efficiency or to make storage in the cache
- * feasible.
+ * Multiple I/O requests from different sources will get munged together.
*
* The calling netfs must provide a table of operations, only one of which,
- * issue_op, is mandatory.
+ * issue_read, is mandatory.
*
* The check_write_begin() operation can be provided to check for and flush
* conflicting writes once the folio is grabbed and locked. It is passed a
@@ -426,6 +698,9 @@ zero_out:
* inode before calling this.
*
* This is usable whether or not caching is enabled.
+ *
+ * Note that this should be considered deprecated and netfs_perform_write()
+ * used instead.
*/
int netfs_write_begin(struct netfs_inode *ctx,
struct file *file, struct address_space *mapping,
@@ -437,8 +712,6 @@ int netfs_write_begin(struct netfs_inode *ctx,
pgoff_t index = pos >> PAGE_SHIFT;
int ret;
- DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
-
retry:
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
@@ -466,7 +739,7 @@ retry:
if (!netfs_is_cache_enabled(ctx) &&
netfs_skip_folio_read(folio, pos, len, false)) {
netfs_stat(&netfs_n_rh_write_zskip);
- goto have_folio;
+ goto have_folio_no_wait;
}
rreq = netfs_alloc_request(mapping, file,
@@ -486,27 +759,22 @@ retry:
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
- /* Expand the request to meet caching requirements and download
- * preferences.
- */
- ractl._nr_pages = folio_nr_pages(folio);
- netfs_rreq_expand(rreq, &ractl);
-
/* Set up the output buffer */
- iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
- rreq->start, rreq->len);
-
- /* We hold the folio locks, so we can drop the references */
- folio_get(folio);
- while (readahead_folio(&ractl))
- ;
+ ret = netfs_create_singular_buffer(rreq, folio);
+ if (ret < 0)
+ goto error_put;
- ret = netfs_begin_read(rreq, true);
+ netfs_read_to_pagecache(rreq);
+ ret = netfs_wait_for_read(rreq);
if (ret < 0)
goto error;
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
+ ret = folio_wait_private_2_killable(folio);
+ if (ret < 0)
+ goto error;
+have_folio_no_wait:
*_folio = folio;
_leave(" = 0");
return 0;
@@ -557,10 +825,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
/* Set up the output buffer */
- iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
- rreq->start, rreq->len);
+ ret = netfs_create_singular_buffer(rreq, folio);
+ if (ret < 0)
+ goto error_put;
- ret = netfs_begin_read(rreq, true);
+ folioq_mark2(rreq->buffer, 0);
+ netfs_read_to_pagecache(rreq);
+ ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 4726c315453c..d7eae597e54d 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -13,91 +13,22 @@
#include <linux/pagevec.h>
#include "internal.h"
-/*
- * Determined write method. Adjust netfs_folio_traces if this is changed.
- */
-enum netfs_how_to_modify {
- NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
- NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
- NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
- NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
- NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
- NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
- NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
-};
-
-static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
- void *priv = folio_get_private(folio);
-
- if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+ if (netfs_group)
folio_attach_private(folio, netfs_get_group(netfs_group));
- else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
- folio_detach_private(folio);
}
-/*
- * Decide how we should modify a folio. We might be attempting to do
- * write-streaming, in which case we don't want to a local RMW cycle if we can
- * avoid it. If we're doing local caching or content crypto, we award that
- * priority over avoiding RMW. If the file is open readably, then we also
- * assume that we may want to read what we wrote.
- */
-static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
- struct file *file,
- struct folio *folio,
- void *netfs_group,
- size_t flen,
- size_t offset,
- size_t len,
- bool maybe_trouble)
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
{
- struct netfs_folio *finfo = netfs_folio_info(folio);
- struct netfs_group *group = netfs_folio_group(folio);
- loff_t pos = folio_pos(folio);
-
- _enter("");
-
- if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
- return NETFS_FLUSH_CONTENT;
-
- if (folio_test_uptodate(folio))
- return NETFS_FOLIO_IS_UPTODATE;
-
- if (pos >= ctx->zero_point)
- return NETFS_MODIFY_AND_CLEAR;
-
- if (!maybe_trouble && offset == 0 && len >= flen)
- return NETFS_WHOLE_FOLIO_MODIFY;
-
- if (file->f_mode & FMODE_READ)
- goto no_write_streaming;
-
- if (netfs_is_cache_enabled(ctx)) {
- /* We don't want to get a streaming write on a file that loses
- * caching service temporarily because the backing store got
- * culled.
- */
- goto no_write_streaming;
- }
+ void *priv = folio_get_private(folio);
- if (!finfo)
- return NETFS_STREAMING_WRITE;
-
- /* We can continue a streaming write only if it continues on from the
- * previous. If it overlaps, we must flush lest we suffer a partial
- * copy and disjoint dirty regions.
- */
- if (offset == finfo->dirty_offset + finfo->dirty_len)
- return NETFS_STREAMING_WRITE_CONT;
- return NETFS_FLUSH_CONTENT;
-
-no_write_streaming:
- if (finfo) {
- netfs_stat(&netfs_n_wh_wstream_conflict);
- return NETFS_FLUSH_CONTENT;
+ if (unlikely(priv != netfs_group)) {
+ if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+ else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+ folio_detach_private(folio);
}
- return NETFS_JUST_PREFETCH;
}
/*
@@ -177,14 +108,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
.range_end = iocb->ki_pos + iter->count,
};
struct netfs_io_request *wreq = NULL;
- struct netfs_folio *finfo;
- struct folio *folio, *writethrough = NULL;
- enum netfs_how_to_modify howto;
- enum netfs_folio_trace trace;
+ struct folio *folio = NULL, *writethrough = NULL;
unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
ssize_t written = 0, ret, ret2;
- loff_t i_size, pos = iocb->ki_pos, from, to;
- size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+ loff_t i_size, pos = iocb->ki_pos;
+ size_t max_chunk = mapping_max_folio_size(mapping);
bool maybe_trouble = false;
if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
@@ -213,15 +141,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
do {
+ struct netfs_folio *finfo;
+ struct netfs_group *group;
+ unsigned long long fpos;
size_t flen;
size_t offset; /* Offset into pagecache folio */
size_t part; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
- if (unlikely(ret < 0))
- break;
-
offset = pos & (max_chunk - 1);
part = min(max_chunk - offset, iov_iter_count(iter));
@@ -247,7 +174,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
}
flen = folio_size(folio);
- offset = pos & (flen - 1);
+ fpos = folio_pos(folio);
+ offset = pos - fpos;
part = min_t(size_t, flen - offset, part);
/* Wait for writeback to complete. The writeback engine owns
@@ -265,71 +193,52 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
goto error_folio_unlock;
}
- /* See if we need to prefetch the area we're going to modify.
- * We need to do this before we get a lock on the folio in case
- * there's more than one writer competing for the same cache
- * block.
+ /* Decide how we should modify a folio. We might be attempting
+	 * to do write-streaming, in which case we don't want to do a
+ * local RMW cycle if we can avoid it. If we're doing local
+ * caching or content crypto, we award that priority over
+ * avoiding RMW. If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
*/
- howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
- flen, offset, part, maybe_trouble);
- _debug("howto %u", howto);
- switch (howto) {
- case NETFS_JUST_PREFETCH:
- ret = netfs_prefetch_for_write(file, folio, offset, part);
- if (ret < 0) {
- _debug("prefetch = %zd", ret);
- goto error_folio_unlock;
- }
- break;
- case NETFS_FOLIO_IS_UPTODATE:
- case NETFS_WHOLE_FOLIO_MODIFY:
- case NETFS_STREAMING_WRITE_CONT:
- break;
- case NETFS_MODIFY_AND_CLEAR:
- zero_user_segment(&folio->page, 0, offset);
- break;
- case NETFS_STREAMING_WRITE:
- ret = -EIO;
- if (WARN_ON(folio_get_private(folio)))
- goto error_folio_unlock;
- break;
- case NETFS_FLUSH_CONTENT:
- trace_netfs_folio(folio, netfs_flush_content);
- from = folio_pos(folio);
- to = from + folio_size(folio) - 1;
- folio_unlock(folio);
- folio_put(folio);
- ret = filemap_write_and_wait_range(mapping, from, to);
- if (ret < 0)
- goto error_folio_unlock;
- continue;
- }
-
- if (mapping_writably_mapped(mapping))
- flush_dcache_folio(folio);
-
- copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
-
- flush_dcache_folio(folio);
-
- /* Deal with a (partially) failed copy */
- if (copied == 0) {
- ret = -EFAULT;
- goto error_folio_unlock;
+ finfo = netfs_folio_info(folio);
+ group = netfs_folio_group(folio);
+
+ if (unlikely(group != netfs_group) &&
+ group != NETFS_FOLIO_COPY_TO_CACHE)
+ goto flush_content;
+
+ if (folio_test_uptodate(folio)) {
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
+ netfs_set_group(folio, netfs_group);
+ trace_netfs_folio(folio, netfs_folio_is_uptodate);
+ goto copied;
}
- trace = (enum netfs_folio_trace)howto;
- switch (howto) {
- case NETFS_FOLIO_IS_UPTODATE:
- case NETFS_JUST_PREFETCH:
- netfs_set_group(folio, netfs_group);
- break;
- case NETFS_MODIFY_AND_CLEAR:
+ /* If the page is above the zero-point then we assume that the
+ * server would just return a block of zeros or a short read if
+ * we try to read it.
+ */
+ if (fpos >= ctx->zero_point) {
+ zero_user_segment(&folio->page, 0, offset);
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
zero_user_segment(&folio->page, offset + copied, flen);
- netfs_set_group(folio, netfs_group);
+ __netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
- break;
- case NETFS_WHOLE_FOLIO_MODIFY:
+ trace_netfs_folio(folio, netfs_modify_and_clear);
+ goto copied;
+ }
+
+ /* See if we can write a whole folio in one go. */
+ if (!maybe_trouble && offset == 0 && part >= flen) {
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
if (unlikely(copied < part)) {
maybe_trouble = true;
iov_iter_revert(iter, copied);
@@ -337,16 +246,53 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_unlock(folio);
goto retry;
}
- netfs_set_group(folio, netfs_group);
+ __netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
- break;
- case NETFS_STREAMING_WRITE:
+ trace_netfs_folio(folio, netfs_whole_folio_modify);
+ goto copied;
+ }
+
+ /* We don't want to do a streaming write on a file that loses
+ * caching service temporarily because the backing store got
+ * culled and we don't really want to get a streaming write on
+ * a file that's open for reading as ->read_folio() then has to
+ * be able to flush it.
+ */
+ if ((file->f_mode & FMODE_READ) ||
+ netfs_is_cache_enabled(ctx)) {
+ if (finfo) {
+ netfs_stat(&netfs_n_wh_wstream_conflict);
+ goto flush_content;
+ }
+ ret = netfs_prefetch_for_write(file, folio, offset, part);
+ if (ret < 0) {
+ _debug("prefetch = %zd", ret);
+ goto error_folio_unlock;
+ }
+ /* Note that copy-to-cache may have been set. */
+
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
+ netfs_set_group(folio, netfs_group);
+ trace_netfs_folio(folio, netfs_just_prefetch);
+ goto copied;
+ }
+
+ if (!finfo) {
+ ret = -EIO;
+ if (WARN_ON(folio_get_private(folio)))
+ goto error_folio_unlock;
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
if (offset == 0 && copied == flen) {
- netfs_set_group(folio, netfs_group);
+ __netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
- trace = netfs_streaming_filled_page;
- break;
+ trace_netfs_folio(folio, netfs_streaming_filled_page);
+ goto copied;
}
+
finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
if (!finfo) {
iov_iter_revert(iter, copied);
@@ -358,9 +304,18 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
finfo->dirty_len = copied;
folio_attach_private(folio, (void *)((unsigned long)finfo |
NETFS_FOLIO_INFO));
- break;
- case NETFS_STREAMING_WRITE_CONT:
- finfo = netfs_folio_info(folio);
+ trace_netfs_folio(folio, netfs_streaming_write);
+ goto copied;
+ }
+
+ /* We can continue a streaming write only if it continues on
+ * from the previous. If it overlaps, we must flush lest we
+ * suffer a partial copy and disjoint dirty regions.
+ */
+ if (offset == finfo->dirty_offset + finfo->dirty_len) {
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+ if (unlikely(copied == 0))
+ goto copy_failed;
finfo->dirty_len += copied;
if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
if (finfo->netfs_group)
@@ -369,17 +324,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_detach_private(folio);
folio_mark_uptodate(folio);
kfree(finfo);
- trace = netfs_streaming_cont_filled_page;
+ trace_netfs_folio(folio, netfs_streaming_cont_filled_page);
+ } else {
+ trace_netfs_folio(folio, netfs_streaming_write_cont);
}
- break;
- default:
- WARN(true, "Unexpected modify type %u ix=%lx\n",
- howto, folio->index);
- ret = -EIO;
- goto error_folio_unlock;
+ goto copied;
}
- trace_netfs_folio(folio, trace);
+ /* Incompatible write; flush the folio and try again. */
+ flush_content:
+ trace_netfs_folio(folio, netfs_flush_content);
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
+ if (ret < 0)
+ goto error_folio_unlock;
+ continue;
+
+ copied:
+ flush_dcache_folio(folio);
/* Update the inode size if we moved the EOF marker */
pos += copied;
@@ -401,12 +364,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
folio_put(folio);
folio = NULL;
+ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+ if (unlikely(ret < 0))
+ break;
+
cond_resched();
} while (iov_iter_count(iter));
out:
- if (likely(written) && ctx->ops->post_modify)
- ctx->ops->post_modify(inode);
+ if (likely(written)) {
+ /* Set indication that ctime and mtime got updated in case
+ * close is deferred.
+ */
+ set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags);
+ if (unlikely(ctx->ops->post_modify))
+ ctx->ops->post_modify(inode);
+ }
if (unlikely(wreq)) {
ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
@@ -421,6 +394,8 @@ out:
_leave(" = %zd [%zd]", written, ret);
return written ? written : ret;
+copy_failed:
+ ret = -EFAULT;
error_folio_unlock:
folio_unlock(folio);
folio_put(folio);
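
A worked example of the streaming-write bookkeeping the rewrite preserves: the dirty span recorded in struct netfs_folio may only grow contiguously, and anything else forces a flush. Byte counts are invented:

	/*
	 * On one non-uptodate folio:
	 *   write 100 bytes at offset 0   -> dirty_offset = 0, dirty_len = 100
	 *                                    (netfs_streaming_write)
	 *   write 50 bytes at offset 100  -> offset == dirty_offset + dirty_len,
	 *                                    so dirty_len = 150
	 *                                    (netfs_streaming_write_cont)
	 *   write 10 bytes at offset 300  -> not contiguous: flush_content and retry
	 *
	 * Once dirty_offset == 0 and dirty_len == folio_size(folio), the folio is
	 * marked uptodate and the netfs_folio record is freed.
	 */
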
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 10a1e4da6bda..b1a66a6e6bc2 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -16,6 +16,143 @@
#include <linux/netfs.h>
#include "internal.h"
+static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ size_t rsize;
+
+ rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len);
+ subreq->len = rsize;
+
+ if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
+ size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize,
+ rreq->io_streams[0].sreq_max_segs);
+
+ if (limit < rsize) {
+ subreq->len = limit;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ subreq->io_iter = rreq->iter;
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(&rreq->iter, subreq->len);
+}
+
+/*
+ * Perform a read to a buffer from the server, slicing up the region to be read
+ * according to the network rsize.
+ */
+static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
+{
+ unsigned long long start = rreq->start;
+ ssize_t size = rreq->len;
+ int ret = 0;
+
+ atomic_set(&rreq->nr_outstanding, 1);
+
+ do {
+ struct netfs_io_subrequest *subreq;
+ ssize_t slice;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->start = start;
+ subreq->len = size;
+
+ atomic_inc(&rreq->nr_outstanding);
+ spin_lock_bh(&rreq->lock);
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+ subreq->prev_donated = rreq->prev_donated;
+ rreq->prev_donated = 0;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_added);
+ spin_unlock_bh(&rreq->lock);
+
+ netfs_stat(&netfs_n_rh_download);
+ if (rreq->netfs_ops->prepare_read) {
+ ret = rreq->netfs_ops->prepare_read(subreq);
+ if (ret < 0) {
+ atomic_dec(&rreq->nr_outstanding);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ break;
+ }
+ }
+
+ netfs_prepare_dio_read_iterator(subreq);
+ slice = subreq->len;
+ rreq->netfs_ops->issue_read(subreq);
+
+ size -= slice;
+ start += slice;
+ rreq->submitted += slice;
+
+ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
+ break;
+ cond_resched();
+ } while (size > 0);
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ netfs_rreq_terminated(rreq, false);
+ return ret;
+}
+
+/*
+ * Perform a read to an application buffer, bypassing the pagecache and the
+ * local disk cache.
+ */
+static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
+{
+ int ret;
+
+ _enter("R=%x %llx-%llx",
+ rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
+
+ if (rreq->len == 0) {
+ pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
+ return -EIO;
+ }
+
+ // TODO: Use bounce buffer if requested
+
+ inode_dio_begin(rreq->inode);
+
+ ret = netfs_dispatch_unbuffered_reads(rreq);
+
+ if (!rreq->submitted) {
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ inode_dio_end(rreq->inode);
+ ret = 0;
+ goto out;
+ }
+
+ if (sync) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+
+ ret = rreq->error;
+ if (ret == 0 && rreq->submitted < rreq->len &&
+ rreq->origin != NETFS_DIO_READ) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+ ret = -EIO;
+ }
+ } else {
+ ret = -EIOCBQUEUED;
+ }
+
+out:
+ _leave(" = %d", ret);
+ return ret;
+}
+
/**
* netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
* @iocb: The I/O control descriptor describing the read
@@ -31,7 +168,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
struct netfs_io_request *rreq;
ssize_t ret;
size_t orig_count = iov_iter_count(iter);
- bool async = !is_sync_kiocb(iocb);
+ bool sync = is_sync_kiocb(iocb);
_enter("");
@@ -78,13 +215,13 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
// TODO: Set up bounce buffer if needed
- if (async)
+ if (!sync)
rreq->iocb = iocb;
- ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+ ret = netfs_unbuffered_read(rreq, sync);
if (ret < 0)
goto out; /* May be -EIOCBQUEUED */
- if (!async) {
+ if (sync) {
// TODO: Copy from bounce buffer
iocb->ki_pos += rreq->transferred;
ret = rreq->transferred;
@@ -94,8 +231,6 @@ out:
netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
if (ret > 0)
orig_count -= ret;
- if (ret != -EIOCBQUEUED)
- iov_iter_revert(iter, orig_count - iov_iter_count(iter));
return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked);
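The dispatch loop in netfs_dispatch_unbuffered_reads() above can be pictured as a simple slicing cursor: each pass claims at most the network rsize, issues a subrequest for that span, then advances. A hedged userspace sketch of that loop, with illustrative names and sizes only:

#include <stdio.h>

int main(void)
{
	unsigned long long start = 0;
	unsigned long long size = 10 * 1024 + 300;	/* total region to read */
	unsigned long long rsize = 4096;		/* per-subrequest cap */

	while (size > 0) {
		unsigned long long slice = size < rsize ? size : rsize;

		/* In the kernel this is where a subrequest would be
		 * allocated, prepared and issued; here we only report it.
		 */
		printf("subreq: start=%llu len=%llu\n", start, slice);
		start += slice;
		size -= slice;
	}
	return 0;
}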
diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..d4d4b3a8b106 100644
--- a/fs/netfs/fscache_cookie.c
+++ b/fs/netfs/fscache_cookie.c
@@ -741,6 +741,10 @@ again_locked:
spin_lock(&cookie->lock);
}
if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) {
+ if (atomic_read(&cookie->n_accesses) != 0)
+ /* still being accessed: postpone it */
+ break;
+
__fscache_set_cookie_state(cookie,
FSCACHE_COOKIE_STATE_LRU_DISCARDING);
wake = true;
diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c
index 42e98bb523e3..49849005eb7c 100644
--- a/fs/netfs/fscache_main.c
+++ b/fs/netfs/fscache_main.c
@@ -103,6 +103,7 @@ void __exit fscache_exit(void)
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
+ timer_shutdown_sync(&fscache_cookie_lru_timer);
destroy_workqueue(fscache_wq);
pr_notice("FS-Cache unloaded\n");
}
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 7773f3d855a9..c9f0ed24cb7b 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/seq_file.h>
+#include <linux/folio_queue.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
#include <linux/fscache-cache.h>
@@ -22,16 +23,10 @@
/*
* buffered_read.c
*/
-void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len);
/*
- * io.c
- */
-int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
-
-/*
* main.c
*/
extern unsigned int netfs_debug;
@@ -63,6 +58,11 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
+int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
+ bool needs_put);
+struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq);
+void netfs_clear_buffer(struct netfs_io_request *rreq);
+void netfs_reset_iter(struct netfs_io_subrequest *subreq);
/*
* objects.c
@@ -84,6 +84,28 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
+ * read_collect.c
+ */
+void netfs_read_termination_worker(struct work_struct *work);
+void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async);
+
+/*
+ * read_pgpriv2.c
+ */
+void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq,
+ struct netfs_io_request *rreq,
+ struct folio_queue *folioq,
+ int slot);
+void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq);
+bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq);
+
+/*
+ * read_retry.c
+ */
+void netfs_retry_reads(struct netfs_io_request *rreq);
+void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq);
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
@@ -110,6 +132,7 @@ extern atomic_t netfs_n_wh_buffered_write;
extern atomic_t netfs_n_wh_writethrough;
extern atomic_t netfs_n_wh_dio_write;
extern atomic_t netfs_n_wh_writepages;
+extern atomic_t netfs_n_wh_copy_to_cache;
extern atomic_t netfs_n_wh_wstream_conflict;
extern atomic_t netfs_n_wh_upload;
extern atomic_t netfs_n_wh_upload_done;
@@ -117,6 +140,9 @@ extern atomic_t netfs_n_wh_upload_failed;
extern atomic_t netfs_n_wh_write;
extern atomic_t netfs_n_wh_write_done;
extern atomic_t netfs_n_wh_write_failed;
+extern atomic_t netfs_n_wb_lock_skip;
+extern atomic_t netfs_n_wb_lock_wait;
+extern atomic_t netfs_n_folioq;
int netfs_stats_show(struct seq_file *m, void *v);
@@ -150,7 +176,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
loff_t start,
enum netfs_io_origin origin);
void netfs_reissue_write(struct netfs_io_stream *stream,
- struct netfs_io_subrequest *subreq);
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *source);
+void netfs_issue_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream);
int netfs_advance_write(struct netfs_io_request *wreq,
struct netfs_io_stream *stream,
loff_t start, size_t len, bool to_eof);
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
deleted file mode 100644
index c93851b98368..000000000000
--- a/fs/netfs/io.c
+++ /dev/null
@@ -1,647 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* Network filesystem high-level read support.
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include <linux/module.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uio.h>
-#include <linux/sched/mm.h>
-#include <linux/task_io_accounting_ops.h>
-#include "internal.h"
-
-/*
- * Clear the unread part of an I/O request.
- */
-static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
-{
- iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
-}
-
-static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_subrequest *subreq = priv;
-
- netfs_subreq_terminated(subreq, transferred_or_error, was_async);
-}
-
-/*
- * Issue a read against the cache.
- * - Eats the caller's ref on subreq.
- */
-static void netfs_read_from_cache(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq,
- enum netfs_read_from_hole read_hole)
-{
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- netfs_stat(&netfs_n_rh_read);
- cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
- netfs_cache_read_terminated, subreq);
-}
-
-/*
- * Fill a subrequest region with zeroes.
- */
-static void netfs_fill_with_zeroes(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_zero);
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
- netfs_subreq_terminated(subreq, 0, false);
-}
-
-/*
- * Ask the netfs to issue a read request to the server for us.
- *
- * The netfs is expected to read from subreq->pos + subreq->transferred to
- * subreq->pos + subreq->len - 1. It may not backtrack and write data into the
- * buffer prior to the transferred point as it might clobber dirty data
- * obtained from the cache.
- *
- * Alternatively, the netfs is allowed to indicate one of two things:
- *
- * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
- * make progress.
- *
- * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
- * cleared.
- */
-static void netfs_read_from_server(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- netfs_stat(&netfs_n_rh_download);
-
- if (rreq->origin != NETFS_DIO_READ &&
- iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
- pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
- rreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->len,
- subreq->transferred, subreq->flags);
- rreq->netfs_ops->issue_read(subreq);
-}
-
-/*
- * Release those waiting.
- */
-static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_done);
- netfs_clear_subrequests(rreq, was_async);
- netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete);
-}
-
-/*
- * Handle a short read.
- */
-static void netfs_rreq_short_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
-
- netfs_stat(&netfs_n_rh_short_read);
- trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
-
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read);
- atomic_inc(&rreq->nr_outstanding);
- if (subreq->source == NETFS_READ_FROM_CACHE)
- netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR);
- else
- netfs_read_from_server(rreq, subreq);
-}
-
-/*
- * Reset the subrequest iterator prior to resubmission.
- */
-static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
-{
- size_t remaining = subreq->len - subreq->transferred;
- size_t count = iov_iter_count(&subreq->io_iter);
-
- if (count == remaining)
- return;
-
- _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
- rreq->debug_id, subreq->debug_index,
- iov_iter_count(&subreq->io_iter), subreq->transferred,
- subreq->len, rreq->i_size,
- subreq->io_iter.iter_type);
-
- if (count < remaining)
- iov_iter_revert(&subreq->io_iter, remaining - count);
- else
- iov_iter_advance(&subreq->io_iter, count - remaining);
-}
-
-/*
- * Resubmit any short or failed operations. Returns true if we got the rreq
- * ref back.
- */
-static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
-
- WARN_ON(in_interrupt());
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
-
- /* We don't want terminating submissions trying to wake us up whilst
- * we're still going through the list.
- */
- atomic_inc(&rreq->nr_outstanding);
-
- __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->error) {
- if (subreq->source != NETFS_READ_FROM_CACHE)
- break;
- subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
- subreq->error = 0;
- netfs_stat(&netfs_n_rh_download_instead);
- trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
- atomic_inc(&rreq->nr_outstanding);
- netfs_reset_subreq_iter(rreq, subreq);
- netfs_read_from_server(rreq, subreq);
- } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
- netfs_rreq_short_read(rreq, subreq);
- }
- }
-
- /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- return true;
-
- wake_up_var(&rreq->nr_outstanding);
- return false;
-}
-
-/*
- * Check to see if the data read is still valid.
- */
-static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
-
- if (!rreq->netfs_ops->is_still_valid ||
- rreq->netfs_ops->is_still_valid(rreq))
- return;
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- subreq->error = -ESTALE;
- __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- }
- }
-}
-
-/*
- * Determine how much we can admit to having read from a DIO read.
- */
-static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- unsigned int i;
- size_t transferred = 0;
-
- for (i = 0; i < rreq->direct_bv_count; i++) {
- flush_dcache_page(rreq->direct_bv[i].bv_page);
- // TODO: cifs marks pages in the destination buffer
- // dirty under some circumstances after a read. Do we
- // need to do that too?
- set_page_dirty(rreq->direct_bv[i].bv_page);
- }
-
- list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
- if (subreq->error || subreq->transferred == 0)
- break;
- transferred += subreq->transferred;
- if (subreq->transferred < subreq->len)
- break;
- }
-
- for (i = 0; i < rreq->direct_bv_count; i++)
- flush_dcache_page(rreq->direct_bv[i].bv_page);
-
- rreq->transferred = transferred;
- task_io_account_read(transferred);
-
- if (rreq->iocb) {
- rreq->iocb->ki_pos += transferred;
- if (rreq->iocb->ki_complete)
- rreq->iocb->ki_complete(
- rreq->iocb, rreq->error ? rreq->error : transferred);
- }
- if (rreq->netfs_ops->done)
- rreq->netfs_ops->done(rreq);
- inode_dio_end(rreq->inode);
-}
-
-/*
- * Assess the state of a read request and decide what to do next.
- *
- * Note that we could be in an ordinary kernel thread, on a workqueue or in
- * softirq context at this point. We inherit a ref from the caller.
- */
-static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async)
-{
- trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
-
-again:
- netfs_rreq_is_still_valid(rreq);
-
- if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
- test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
- if (netfs_rreq_perform_resubmissions(rreq))
- goto again;
- return;
- }
-
- if (rreq->origin != NETFS_DIO_READ)
- netfs_rreq_unlock_folios(rreq);
- else
- netfs_rreq_assess_dio(rreq);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
- clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
-
- netfs_rreq_completed(rreq, was_async);
-}
-
-static void netfs_rreq_work(struct work_struct *work)
-{
- struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
- netfs_rreq_assess(rreq, false);
-}
-
-/*
- * Handle the completion of all outstanding I/O operations on a read request.
- * We inherit a ref from the caller.
- */
-static void netfs_rreq_terminated(struct netfs_io_request *rreq,
- bool was_async)
-{
- if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
- was_async) {
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_rreq_assess(rreq, was_async);
- }
-}
-
-/**
- * netfs_subreq_terminated - Note the termination of an I/O operation.
- * @subreq: The I/O request that has terminated.
- * @transferred_or_error: The amount of data transferred or an error code.
- * @was_async: The termination was asynchronous
- *
- * This tells the read helper that a contributory I/O operation has terminated,
- * one way or another, and that it should integrate the results.
- *
- * The caller indicates in @transferred_or_error the outcome of the operation,
- * supplying a positive value to indicate the number of bytes transferred, 0 to
- * indicate a failure to transfer anything that should be retried or a negative
- * error code. The helper will look after reissuing I/O operations as
- * appropriate and writing downloaded data to the cache.
- *
- * If @was_async is true, the caller might be running in softirq or interrupt
- * context and we can't sleep.
- */
-void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
- ssize_t transferred_or_error,
- bool was_async)
-{
- struct netfs_io_request *rreq = subreq->rreq;
- int u;
-
- _enter("R=%x[%x]{%llx,%lx},%zd",
- rreq->debug_id, subreq->debug_index,
- subreq->start, subreq->flags, transferred_or_error);
-
- switch (subreq->source) {
- case NETFS_READ_FROM_CACHE:
- netfs_stat(&netfs_n_rh_read_done);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_stat(&netfs_n_rh_download_done);
- break;
- default:
- break;
- }
-
- if (IS_ERR_VALUE(transferred_or_error)) {
- subreq->error = transferred_or_error;
- trace_netfs_failure(rreq, subreq, transferred_or_error,
- netfs_fail_read);
- goto failed;
- }
-
- if (WARN(transferred_or_error > subreq->len - subreq->transferred,
- "Subreq overread: R%x[%x] %zd > %zu - %zu",
- rreq->debug_id, subreq->debug_index,
- transferred_or_error, subreq->len, subreq->transferred))
- transferred_or_error = subreq->len - subreq->transferred;
-
- subreq->error = 0;
- subreq->transferred += transferred_or_error;
- if (subreq->transferred < subreq->len)
- goto incomplete;
-
-complete:
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
- set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
-
-out:
- trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- u = atomic_dec_return(&rreq->nr_outstanding);
- if (u == 0)
- netfs_rreq_terminated(rreq, was_async);
- else if (u == 1)
- wake_up_var(&rreq->nr_outstanding);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
- return;
-
-incomplete:
- if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
- netfs_clear_unread(subreq);
- subreq->transferred = subreq->len;
- goto complete;
- }
-
- if (transferred_or_error == 0) {
- if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
- subreq->error = -ENODATA;
- goto failed;
- }
- } else {
- __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
- }
-
- __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- goto out;
-
-failed:
- if (subreq->source == NETFS_READ_FROM_CACHE) {
- netfs_stat(&netfs_n_rh_read_failed);
- set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
- } else {
- netfs_stat(&netfs_n_rh_download_failed);
- set_bit(NETFS_RREQ_FAILED, &rreq->flags);
- rreq->error = subreq->error;
- }
- goto out;
-}
-EXPORT_SYMBOL(netfs_subreq_terminated);
-
-static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq,
- loff_t i_size)
-{
- struct netfs_io_request *rreq = subreq->rreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
-
- if (cres->ops)
- return cres->ops->prepare_read(subreq, i_size);
- if (subreq->start >= rreq->i_size)
- return NETFS_FILL_WITH_ZEROES;
- return NETFS_DOWNLOAD_FROM_SERVER;
-}
-
-/*
- * Work out what sort of subrequest the next one will be.
- */
-static enum netfs_io_source
-netfs_rreq_prepare_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq,
- struct iov_iter *io_iter)
-{
- enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
- struct netfs_inode *ictx = netfs_inode(rreq->inode);
- size_t lsize;
-
- _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
-
- if (rreq->origin != NETFS_DIO_READ) {
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
- }
-
- if (source == NETFS_DOWNLOAD_FROM_SERVER) {
- /* Call out to the netfs to let it shrink the request to fit
- * its own I/O sizes and boundaries. If it shrinks it here, it
- * will be called again to make simultaneous calls; if it wants
- * to make serial calls, it can indicate a short read and then
- * we will call it again.
- */
- if (rreq->origin != NETFS_DIO_READ) {
- if (subreq->start >= ictx->zero_point) {
- source = NETFS_FILL_WITH_ZEROES;
- goto set;
- }
- if (subreq->len > ictx->zero_point - subreq->start)
- subreq->len = ictx->zero_point - subreq->start;
- }
- if (subreq->len > rreq->i_size - subreq->start)
- subreq->len = rreq->i_size - subreq->start;
- if (rreq->rsize && subreq->len > rreq->rsize)
- subreq->len = rreq->rsize;
-
- if (rreq->netfs_ops->clamp_length &&
- !rreq->netfs_ops->clamp_length(subreq)) {
- source = NETFS_INVALID_READ;
- goto out;
- }
-
- if (subreq->max_nr_segs) {
- lsize = netfs_limit_iter(io_iter, 0, subreq->len,
- subreq->max_nr_segs);
- if (subreq->len > lsize) {
- subreq->len = lsize;
- trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
- }
- }
- }
-
-set:
- if (subreq->len > rreq->len)
- pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n",
- rreq->debug_id, subreq->debug_index,
- subreq->len, rreq->len);
-
- if (WARN_ON(subreq->len == 0)) {
- source = NETFS_INVALID_READ;
- goto out;
- }
-
- subreq->source = source;
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
-
- subreq->io_iter = *io_iter;
- iov_iter_truncate(&subreq->io_iter, subreq->len);
- iov_iter_advance(io_iter, subreq->len);
-out:
- subreq->source = source;
- trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
- return source;
-}
-
-/*
- * Slice off a piece of a read request and submit an I/O request for it.
- */
-static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
- struct iov_iter *io_iter)
-{
- struct netfs_io_subrequest *subreq;
- enum netfs_io_source source;
-
- subreq = netfs_alloc_subrequest(rreq);
- if (!subreq)
- return false;
-
- subreq->start = rreq->start + rreq->submitted;
- subreq->len = io_iter->count;
-
- _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
- list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-
- /* Call out to the cache to find out what it can do with the remaining
- * subset. It tells us in subreq->flags what it decided should be done
- * and adjusts subreq->len down if the subset crosses a cache boundary.
- *
- * Then when we hand the subset, it can choose to take a subset of that
- * (the starts must coincide), in which case, we go around the loop
- * again and ask it to download the next piece.
- */
- source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
- if (source == NETFS_INVALID_READ)
- goto subreq_failed;
-
- atomic_inc(&rreq->nr_outstanding);
-
- rreq->submitted += subreq->len;
-
- trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
- switch (source) {
- case NETFS_FILL_WITH_ZEROES:
- netfs_fill_with_zeroes(rreq, subreq);
- break;
- case NETFS_DOWNLOAD_FROM_SERVER:
- netfs_read_from_server(rreq, subreq);
- break;
- case NETFS_READ_FROM_CACHE:
- netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE);
- break;
- default:
- BUG();
- }
-
- return true;
-
-subreq_failed:
- rreq->error = subreq->error;
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed);
- return false;
-}
-
-/*
- * Begin the process of reading in a chunk of data, where that data may be
- * stitched together from multiple sources, including multiple servers and the
- * local cache.
- */
-int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
-{
- struct iov_iter io_iter;
- int ret;
-
- _enter("R=%x %llx-%llx",
- rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
-
- if (rreq->len == 0) {
- pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
- return -EIO;
- }
-
- if (rreq->origin == NETFS_DIO_READ)
- inode_dio_begin(rreq->inode);
-
- // TODO: Use bounce buffer if requested
- rreq->io_iter = rreq->iter;
-
- INIT_WORK(&rreq->work, netfs_rreq_work);
-
- /* Chop the read into slices according to what the cache and the netfs
- * want and submit each one.
- */
- netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
- atomic_set(&rreq->nr_outstanding, 1);
- io_iter = rreq->io_iter;
- do {
- _debug("submit %llx + %llx >= %llx",
- rreq->start, rreq->submitted, rreq->i_size);
- if (rreq->origin == NETFS_DIO_READ &&
- rreq->start + rreq->submitted >= rreq->i_size)
- break;
- if (!netfs_rreq_submit_slice(rreq, &io_iter))
- break;
- if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
- test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
- break;
-
- } while (rreq->submitted < rreq->len);
-
- if (!rreq->submitted) {
- netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
- if (rreq->origin == NETFS_DIO_READ)
- inode_dio_end(rreq->inode);
- ret = 0;
- goto out;
- }
-
- if (sync) {
- /* Keep nr_outstanding incremented so that the ref always
- * belongs to us, and the service code isn't punted off to a
- * random thread pool to process. Note that this might start
- * further work, such as writing to the cache.
- */
- wait_var_event(&rreq->nr_outstanding,
- atomic_read(&rreq->nr_outstanding) == 1);
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- netfs_rreq_assess(rreq, false);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
- wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
-
- ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len &&
- rreq->origin != NETFS_DIO_READ) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
- ret = -EIO;
- }
- } else {
- /* If we decrement nr_outstanding to 0, the ref belongs to us. */
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- netfs_rreq_assess(rreq, false);
- ret = -EIOCBQUEUED;
- }
-
-out:
- return ret;
-}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index b781bbbf1d8d..72a435e5fc6d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -188,9 +188,59 @@ static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offse
return min(span, max_size);
}
+/*
+ * Select the span of a folio queue iterator we're going to use. Limit it by
+ * both maximum size and maximum number of segments. Returns the size of the
+ * span in bytes.
+ */
+static size_t netfs_limit_folioq(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct folio_queue *folioq = iter->folioq;
+ unsigned int nsegs = 0;
+ unsigned int slot = iter->folioq_slot;
+ size_t span = 0, n = iter->count;
+
+ if (WARN_ON(!iov_iter_is_folioq(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+ max_size = umin(max_size, n - start_offset);
+
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = folioq->next;
+ slot = 0;
+ }
+
+ start_offset += iter->iov_offset;
+ do {
+ size_t flen = folioq_folio_size(folioq, slot);
+
+ if (start_offset < flen) {
+ span += flen - start_offset;
+ nsegs++;
+ start_offset = 0;
+ } else {
+ start_offset -= flen;
+ }
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = folioq->next;
+ slot = 0;
+ }
+ } while (folioq);
+
+ return umin(span, max_size);
+}
+
size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
size_t max_size, size_t max_segs)
{
+ if (iov_iter_is_folioq(iter))
+ return netfs_limit_folioq(iter, start_offset, max_size, max_segs);
if (iov_iter_is_bvec(iter))
return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
if (iov_iter_is_xarray(iter))
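netfs_limit_folioq() above bounds the usable span by both a byte limit and a segment limit as it walks the folio-queue slots. The following simplified model uses a flat array of folio sizes in place of a folio_queue; the names and numbers are illustrative and not kernel API:

#include <stdio.h>

static size_t limit_span(const size_t *folio_sizes, unsigned int nr_folios,
			 size_t start_offset, size_t max_size, size_t max_segs)
{
	size_t span = 0, nsegs = 0;

	for (unsigned int i = 0; i < nr_folios; i++) {
		size_t flen = folio_sizes[i];

		if (start_offset < flen) {
			/* This folio contributes its tail beyond the offset. */
			span += flen - start_offset;
			nsegs++;
			start_offset = 0;
		} else {
			/* Skip folios wholly before the starting offset. */
			start_offset -= flen;
		}
		if (span >= max_size || nsegs >= max_segs)
			break;
	}
	return span < max_size ? span : max_size;
}

int main(void)
{
	size_t sizes[] = { 4096, 16384, 4096, 4096 };

	/* Start 1024 bytes in, cap at 20000 bytes or 2 segments. */
	printf("span = %zu\n", limit_span(sizes, 4, 1024, 20000, 2));
	return 0;
}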
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
index 75dc52a49b3a..21eab56ee2f9 100644
--- a/fs/netfs/locking.c
+++ b/fs/netfs/locking.c
@@ -19,25 +19,13 @@
* Must be called under a lock that serializes taking new references
* to i_dio_count, usually by inode->i_mutex.
*/
-static int inode_dio_wait_interruptible(struct inode *inode)
+static int netfs_inode_dio_wait_interruptible(struct inode *inode)
{
- if (!atomic_read(&inode->i_dio_count))
+ if (inode_dio_finished(inode))
return 0;
- wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
- DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
-
- for (;;) {
- prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
- if (!atomic_read(&inode->i_dio_count))
- break;
- if (signal_pending(current))
- break;
- schedule();
- }
- finish_wait(wq, &q.wq_entry);
-
- return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+ inode_dio_wait_interruptible(inode);
+ return !inode_dio_finished(inode) ? -ERESTARTSYS : 0;
}
/* Call with exclusively locked inode->i_rwsem */
@@ -46,7 +34,7 @@ static int netfs_block_o_direct(struct netfs_inode *ictx)
if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
return 0;
clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
- return inode_dio_wait_interruptible(&ictx->inode);
+ return netfs_inode_dio_wait_interruptible(&ictx->inode);
}
/**
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5f0f438e5d21..6c7be1377ee0 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -36,13 +36,14 @@ DEFINE_SPINLOCK(netfs_proc_lock);
static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READAHEAD] = "RA",
[NETFS_READPAGE] = "RP",
+ [NETFS_READ_GAPS] = "RG",
[NETFS_READ_FOR_WRITE] = "RW",
- [NETFS_COPY_TO_CACHE] = "CC",
+ [NETFS_DIO_READ] = "DR",
[NETFS_WRITEBACK] = "WB",
[NETFS_WRITETHROUGH] = "WT",
[NETFS_UNBUFFERED_WRITE] = "UW",
- [NETFS_DIO_READ] = "DR",
[NETFS_DIO_WRITE] = "DW",
+ [NETFS_PGPRIV2_COPY_TO_CACHE] = "2C",
};
/*
@@ -62,7 +63,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
- "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx",
+ "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),
@@ -142,7 +143,7 @@ static int __init netfs_init(void)
error_fscache:
error_procfile:
- remove_proc_entry("fs/netfs", NULL);
+ remove_proc_subtree("fs/netfs", NULL);
error_proc:
mempool_exit(&netfs_subrequest_pool);
error_subreqpool:
@@ -159,7 +160,7 @@ fs_initcall(netfs_init);
static void __exit netfs_exit(void)
{
fscache_exit();
- remove_proc_entry("fs/netfs", NULL);
+ remove_proc_subtree("fs/netfs", NULL);
mempool_exit(&netfs_subrequest_pool);
kmem_cache_destroy(netfs_subrequest_slab);
mempool_exit(&netfs_request_pool);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 83e644bd518f..0ad0982ce0e2 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -8,6 +8,100 @@
#include <linux/swap.h>
#include "internal.h"
+/*
+ * Append a folio to the rolling queue.
+ */
+int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
+ bool needs_put)
+{
+ struct folio_queue *tail = rreq->buffer_tail;
+ unsigned int slot, order = folio_order(folio);
+
+ if (WARN_ON_ONCE(!rreq->buffer && tail) ||
+ WARN_ON_ONCE(rreq->buffer && !tail))
+ return -EIO;
+
+ if (!tail || folioq_full(tail)) {
+ tail = kmalloc(sizeof(*tail), GFP_NOFS);
+ if (!tail)
+ return -ENOMEM;
+ netfs_stat(&netfs_n_folioq);
+ folioq_init(tail);
+ tail->prev = rreq->buffer_tail;
+ if (tail->prev)
+ tail->prev->next = tail;
+ rreq->buffer_tail = tail;
+ if (!rreq->buffer) {
+ rreq->buffer = tail;
+ iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0);
+ }
+ rreq->buffer_tail_slot = 0;
+ }
+
+ rreq->io_iter.count += PAGE_SIZE << order;
+
+ slot = folioq_append(tail, folio);
+ /* Store the counter after setting the slot. */
+ smp_store_release(&rreq->buffer_tail_slot, slot);
+ return 0;
+}
+
+/*
+ * Delete the head of a rolling queue.
+ */
+struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq)
+{
+ struct folio_queue *head = wreq->buffer, *next = head->next;
+
+ if (next)
+ next->prev = NULL;
+ netfs_stat_d(&netfs_n_folioq);
+ kfree(head);
+ wreq->buffer = next;
+ return next;
+}
+
+/*
+ * Clear out a rolling queue.
+ */
+void netfs_clear_buffer(struct netfs_io_request *rreq)
+{
+ struct folio_queue *p;
+
+ while ((p = rreq->buffer)) {
+ rreq->buffer = p->next;
+ for (int slot = 0; slot < folioq_nr_slots(p); slot++) {
+ struct folio *folio = folioq_folio(p, slot);
+ if (!folio)
+ continue;
+ if (folioq_is_marked(p, slot)) {
+ trace_netfs_folio(folio, netfs_folio_trace_put);
+ folio_put(folio);
+ }
+ }
+ netfs_stat_d(&netfs_n_folioq);
+ kfree(p);
+ }
+}
+
+/*
+ * Reset the subrequest iterator to refer just to the region remaining to be
+ * read. The iterator may or may not have been advanced by socket ops or
+ * extraction ops to an extent that may or may not match the amount actually
+ * read.
+ */
+void netfs_reset_iter(struct netfs_io_subrequest *subreq)
+{
+ struct iov_iter *io_iter = &subreq->io_iter;
+ size_t remain = subreq->len - subreq->transferred;
+
+ if (io_iter->count > remain)
+ iov_iter_advance(io_iter, io_iter->count - remain);
+ else if (io_iter->count < remain)
+ iov_iter_revert(io_iter, remain - io_iter->count);
+ iov_iter_truncate(&subreq->io_iter, remain);
+}
+
/**
* netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
* @mapping: The mapping the folio belongs to.
@@ -97,10 +191,22 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
struct netfs_folio *finfo;
+ struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
size_t flen = folio_size(folio);
_enter("{%lx},%zx,%zx", folio->index, offset, length);
+ if (offset == 0 && length == flen) {
+ unsigned long long i_size = i_size_read(&ctx->inode);
+ unsigned long long fpos = folio_pos(folio), end;
+
+ end = umin(fpos + flen, i_size);
+ if (fpos < i_size && end > ctx->zero_point)
+ ctx->zero_point = end;
+ }
+
+ folio_wait_private_2(folio); /* [DEPRECATED] */
+
if (!folio_test_private(folio))
return;
@@ -113,18 +219,34 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
/* We have a partially uptodate page from a streaming write. */
unsigned int fstart = finfo->dirty_offset;
unsigned int fend = fstart + finfo->dirty_len;
- unsigned int end = offset + length;
+ unsigned int iend = offset + length;
if (offset >= fend)
return;
- if (end <= fstart)
+ if (iend <= fstart)
return;
- if (offset <= fstart && end >= fend)
- goto erase_completely;
- if (offset <= fstart && end > fstart)
- goto reduce_len;
- if (offset > fstart && end >= fend)
- goto move_start;
+
+ /* The invalidation region overlaps the data. If the region
+ * covers the start of the data, we either move along the start
+ * or just erase the data entirely.
+ */
+ if (offset <= fstart) {
+ if (iend >= fend)
+ goto erase_completely;
+ /* Move the start of the data. */
+ finfo->dirty_len = fend - iend;
+ finfo->dirty_offset = offset;
+ return;
+ }
+
+ /* Reduce the length of the data if the invalidation region
+ * covers the tail part.
+ */
+ if (iend >= fend) {
+ finfo->dirty_len = offset - fstart;
+ return;
+ }
+
/* A partial write was split. The caller has already zeroed
* it, so just absorb the hole.
*/
@@ -137,12 +259,6 @@ erase_completely:
folio_clear_uptodate(folio);
kfree(finfo);
return;
-reduce_len:
- finfo->dirty_len = offset + length - finfo->dirty_offset;
- return;
-move_start:
- finfo->dirty_len -= offset - finfo->dirty_offset;
- finfo->dirty_offset = offset;
}
EXPORT_SYMBOL(netfs_invalidate_folio);
@@ -159,12 +275,20 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
unsigned long long end;
- end = folio_pos(folio) + folio_size(folio);
+ if (folio_test_dirty(folio))
+ return false;
+
+ end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode));
if (end > ctx->zero_point)
ctx->zero_point = end;
if (folio_test_private(folio))
return false;
+ if (unlikely(folio_test_private_2(folio))) { /* [DEPRECATED] */
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_private_2(folio);
+ }
fscache_note_page_release(netfs_i_cookie(ctx));
return true;
}
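The rolling buffer managed by netfs_buffer_append_folio() and netfs_delete_buffer_head() above is a chain of fixed-slot segments that grows at the tail. Below is a small userspace sketch of that append pattern, assuming an illustrative four-slot segment and plain integers in place of folios; it is not the kernel's folio_queue API.

#include <stdio.h>
#include <stdlib.h>

#define SEG_SLOTS 4	/* slots per segment (illustrative only) */

struct segment {
	struct segment *next, *prev;
	int nr;				/* used slots */
	int items[SEG_SLOTS];
};

struct rolling_buffer {
	struct segment *head, *tail;
};

/* Append one item, chaining on a fresh segment when the tail is full. */
static int rb_append(struct rolling_buffer *rb, int item)
{
	struct segment *tail = rb->tail;

	if (!tail || tail->nr == SEG_SLOTS) {
		tail = calloc(1, sizeof(*tail));
		if (!tail)
			return -1;
		tail->prev = rb->tail;
		if (tail->prev)
			tail->prev->next = tail;
		rb->tail = tail;
		if (!rb->head)
			rb->head = tail;
	}
	tail->items[tail->nr++] = item;
	return 0;
}

int main(void)
{
	struct rolling_buffer rb = { NULL, NULL };
	struct segment *s, *next;

	for (int i = 0; i < 10; i++)
		rb_append(&rb, i);

	for (s = rb.head; s; s = next) {
		next = s->next;
		printf("segment holds %d items\n", s->nr);
		free(s);
	}
	return 0;
}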
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index f4a642727479..31e388ec6e48 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -24,10 +24,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct netfs_io_request *rreq;
mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool;
struct kmem_cache *cache = mempool->pool_data;
- bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
- origin == NETFS_DIO_READ ||
- origin == NETFS_DIO_WRITE);
- bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
for (;;) {
@@ -40,7 +36,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
memset(rreq, 0, kmem_cache_size(cache));
rreq->start = start;
rreq->len = len;
- rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
@@ -48,20 +43,24 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
rreq->wsize = INT_MAX;
+ rreq->io_streams[0].sreq_max_len = ULONG_MAX;
+ rreq->io_streams[0].sreq_max_segs = 0;
spin_lock_init(&rreq->lock);
INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
INIT_LIST_HEAD(&rreq->subrequests);
- INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
+ if (origin == NETFS_READAHEAD ||
+ origin == NETFS_READPAGE ||
+ origin == NETFS_READ_GAPS ||
+ origin == NETFS_READ_FOR_WRITE ||
+ origin == NETFS_DIO_READ)
+ INIT_WORK(&rreq->work, netfs_read_termination_worker);
+ else
+ INIT_WORK(&rreq->work, netfs_write_collection_worker);
+
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (cached) {
- __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
- if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags))
- /* Filesystem uses deprecated PG_private_2 marking. */
- __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
- }
if (file && file->f_flags & O_NONBLOCK)
__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
@@ -144,6 +143,7 @@ static void netfs_free_request(struct work_struct *work)
}
kvfree(rreq->direct_bv);
}
+ netfs_clear_buffer(rreq);
if (atomic_dec_and_test(&ictx->io_count))
wake_up_var(&ictx->io_count);
@@ -165,7 +165,7 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
if (was_async) {
rreq->work.func = netfs_free_request;
if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
+ WARN_ON(1);
} else {
netfs_free_request(&rreq->work);
}
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
new file mode 100644
index 000000000000..b18c65ba5580
--- /dev/null
+++ b/fs/netfs/read_collect.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem read subrequest result collection, assessment and
+ * retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * Clear the unread part of an I/O request.
+ */
+static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
+{
+ netfs_reset_iter(subreq);
+ WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
+ if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
+ __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
+}
+
+/*
+ * Flush, mark and unlock a folio that's now completely read. If we want to
+ * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
+ * dirty and let writeback handle it.
+ */
+static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
+ struct netfs_io_request *rreq,
+ struct folio_queue *folioq,
+ int slot)
+{
+ struct netfs_folio *finfo;
+ struct folio *folio = folioq_folio(folioq, slot);
+
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+
+ if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
+
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
+ folio_mark_dirty(folio);
+ }
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_done);
+ }
+ } else {
+ // TODO: Use of PG_private_2 is deprecated.
+ if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
+ }
+
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
+ if (folio->index == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
+ _debug("no unlock");
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
+ folio_unlock(folio);
+ }
+ }
+}
+
+/*
+ * Unlock any folios that are now completely read. Returns true if the
+ * subrequest is removed from the list.
+ */
+static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
+{
+ struct netfs_io_subrequest *prev, *next;
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct folio_queue *folioq = subreq->curr_folioq;
+ size_t avail, prev_donated, next_donated, fsize, part, excess;
+ loff_t fpos, start;
+ loff_t fend;
+ int slot = subreq->curr_folioq_slot;
+
+ if (WARN(subreq->transferred > subreq->len,
+ "Subreq overread: R%x[%x] %zu > %zu",
+ rreq->debug_id, subreq->debug_index,
+ subreq->transferred, subreq->len))
+ subreq->transferred = subreq->len;
+
+next_folio:
+ fsize = PAGE_SIZE << subreq->curr_folio_order;
+ fpos = round_down(subreq->start + subreq->consumed, fsize);
+ fend = fpos + fsize;
+
+ if (WARN_ON_ONCE(!folioq) ||
+ WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
+ WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
+ pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->start + subreq->transferred - 1,
+ subreq->consumed, subreq->transferred, subreq->len,
+ slot);
+ if (folioq) {
+ struct folio *folio = folioq_folio(folioq, slot);
+
+ pr_err("folioq: orders=%02x%02x%02x%02x\n",
+ folioq->orders[0], folioq->orders[1],
+ folioq->orders[2], folioq->orders[3]);
+ if (folio)
+ pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
+ fpos, fend - 1, folio_pos(folio), folio_order(folio),
+ folioq_folio_order(folioq, slot));
+ }
+ }
+
+donation_changed:
+ /* Try to consume the current folio if we've hit or passed the end of
+ * it. There's a possibility that this subreq doesn't start at the
+ * beginning of the folio, in which case we need to donate to/from the
+ * preceding subreq.
+ *
+ * We also need to include any potential donation back from the
+ * following subreq.
+ */
+ prev_donated = READ_ONCE(subreq->prev_donated);
+ next_donated = READ_ONCE(subreq->next_donated);
+ if (prev_donated || next_donated) {
+ spin_lock_bh(&rreq->lock);
+ prev_donated = subreq->prev_donated;
+ next_donated = subreq->next_donated;
+ subreq->start -= prev_donated;
+ subreq->len += prev_donated;
+ subreq->transferred += prev_donated;
+ prev_donated = subreq->prev_donated = 0;
+ if (subreq->transferred == subreq->len) {
+ subreq->len += next_donated;
+ subreq->transferred += next_donated;
+ next_donated = subreq->next_donated = 0;
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
+ spin_unlock_bh(&rreq->lock);
+ }
+
+ avail = subreq->transferred;
+ if (avail == subreq->len)
+ avail += next_donated;
+ start = subreq->start;
+ if (subreq->consumed == 0) {
+ start -= prev_donated;
+ avail += prev_donated;
+ } else {
+ start += subreq->consumed;
+ avail -= subreq->consumed;
+ }
+ part = umin(avail, fsize);
+
+ trace_netfs_progress(subreq, start, avail, part);
+
+ if (start + avail >= fend) {
+ if (fpos == start) {
+ /* Flush, unlock and mark for caching any folio we've just read. */
+ subreq->consumed = fend - subreq->start;
+ netfs_unlock_read_folio(subreq, rreq, folioq, slot);
+ folioq_mark2(folioq, slot);
+ if (subreq->consumed >= subreq->len)
+ goto remove_subreq;
+ } else if (fpos < start) {
+ excess = fend - subreq->start;
+
+ spin_lock_bh(&rreq->lock);
+ /* If we complete first on a folio split with the
+ * preceding subreq, donate to that subreq - otherwise
+ * we get the responsibility.
+ */
+ if (subreq->prev_donated != prev_donated) {
+ spin_unlock_bh(&rreq->lock);
+ goto donation_changed;
+ }
+
+ if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
+ spin_unlock_bh(&rreq->lock);
+ pr_err("Can't donate prior to front\n");
+ goto bad;
+ }
+
+ prev = list_prev_entry(subreq, rreq_link);
+ WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
+ subreq->start += excess;
+ subreq->len -= excess;
+ subreq->transferred -= excess;
+ trace_netfs_donate(rreq, subreq, prev, excess,
+ netfs_trace_donate_tail_to_prev);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
+
+ if (subreq->consumed >= subreq->len)
+ goto remove_subreq_locked;
+ spin_unlock_bh(&rreq->lock);
+ } else {
+ pr_err("fpos > start\n");
+ goto bad;
+ }
+
+ /* Advance the rolling buffer to the next folio. */
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ slot = 0;
+ folioq = folioq->next;
+ subreq->curr_folioq = folioq;
+ }
+ subreq->curr_folioq_slot = slot;
+ if (folioq && folioq_folio(folioq, slot))
+ subreq->curr_folio_order = folioq->orders[slot];
+ if (!was_async)
+ cond_resched();
+ goto next_folio;
+ }
+
+ /* Deal with partial progress. */
+ if (subreq->transferred < subreq->len)
+ return false;
+
+ /* Donate the remaining downloaded data to one of the neighbouring
+ * subrequests. Note that we may race with them doing the same thing.
+ */
+ spin_lock_bh(&rreq->lock);
+
+ if (subreq->prev_donated != prev_donated ||
+ subreq->next_donated != next_donated) {
+ spin_unlock_bh(&rreq->lock);
+ cond_resched();
+ goto donation_changed;
+ }
+
+ /* Deal with the trickiest case: that this subreq is in the middle of a
+ * folio, not touching either edge, but finishes first. In such a
+ * case, we donate to the previous subreq, if there is one, so that the
+ * donation is only handled when that completes - and remove this
+ * subreq from the list.
+ *
+ * If the previous subreq finished first, we will have acquired their
+ * donation and should be able to unlock folios and/or donate nextwards.
+ */
+ if (!subreq->consumed &&
+ !prev_donated &&
+ !list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
+ prev = list_prev_entry(subreq, rreq_link);
+ WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
+ subreq->start += subreq->len;
+ subreq->len = 0;
+ subreq->transferred = 0;
+ trace_netfs_donate(rreq, subreq, prev, subreq->len,
+ netfs_trace_donate_to_prev);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
+ goto remove_subreq_locked;
+ }
+
+ /* If we can't donate down the chain, donate up the chain instead. */
+ excess = subreq->len - subreq->consumed + next_donated;
+
+ if (!subreq->consumed)
+ excess += prev_donated;
+
+ if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ rreq->prev_donated = excess;
+ trace_netfs_donate(rreq, subreq, NULL, excess,
+ netfs_trace_donate_to_deferred_next);
+ } else {
+ next = list_next_entry(subreq, rreq_link);
+ WRITE_ONCE(next->prev_donated, excess);
+ trace_netfs_donate(rreq, subreq, next, excess,
+ netfs_trace_donate_to_next);
+ }
+ trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
+ subreq->len = subreq->consumed;
+ subreq->transferred = subreq->consumed;
+ goto remove_subreq_locked;
+
+remove_subreq:
+ spin_lock_bh(&rreq->lock);
+remove_subreq_locked:
+ subreq->consumed = subreq->len;
+ list_del(&subreq->rreq_link);
+ spin_unlock_bh(&rreq->lock);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
+ return true;
+
+bad:
+ /* Errr... prev and next both donated to us, but insufficient to finish
+ * the folio.
+ */
+ printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->start + subreq->transferred - 1,
+ subreq->consumed, subreq->transferred, subreq->len);
+ printk("folio: %llx-%llx\n", fpos, fend - 1);
+ printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
+ printk("s=%llx av=%zx part=%zx\n", start, avail, part);
+ BUG();
+}
+
+/*
+ * Do page flushing and suchlike after DIO.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ unsigned int i;
+
+ /* Collect unbuffered reads and direct reads, adding up the transfer
+ * sizes until we find the first short or failed subrequest.
+ */
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ rreq->transferred += subreq->transferred;
+
+ if (subreq->transferred < subreq->len ||
+ test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
+ rreq->error = subreq->error;
+ break;
+ }
+ }
+
+ if (rreq->origin == NETFS_DIO_READ) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+ // TODO: cifs marks pages in the destination buffer
+ // dirty under some circumstances after a read. Do we
+ // need to do that too?
+ set_page_dirty(rreq->direct_bv[i].bv_page);
+ }
+ }
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += rreq->transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_end(rreq->inode);
+}
+
+/*
+ * Assess the state of a read request and decide what to do next.
+ *
+ * Note that we're in normal kernel thread context at this point, possibly
+ * running on a workqueue.
+ */
+static void netfs_rreq_assess(struct netfs_io_request *rreq)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
+
+ //netfs_rreq_is_still_valid(rreq);
+
+ if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
+ netfs_retry_reads(rreq);
+ return;
+ }
+
+ if (rreq->origin == NETFS_DIO_READ ||
+ rreq->origin == NETFS_READ_GAPS)
+ netfs_rreq_assess_dio(rreq);
+ task_io_account_read(rreq->transferred);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+ netfs_clear_subrequests(rreq, false);
+ netfs_unlock_abandoned_read_pages(rreq);
+ if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
+ netfs_pgpriv2_write_to_the_cache(rreq);
+}
+
+void netfs_read_termination_worker(struct work_struct *work)
+{
+ struct netfs_io_request *rreq =
+ container_of(work, struct netfs_io_request, work);
+ netfs_see_request(rreq, netfs_rreq_trace_see_work);
+ netfs_rreq_assess(rreq);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Handle the completion of all outstanding I/O operations on a read request.
+ * We inherit a ref from the caller.
+ */
+void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
+{
+ if (!was_async)
+ return netfs_rreq_assess(rreq);
+ if (!work_pending(&rreq->work)) {
+ netfs_get_request(rreq, netfs_rreq_trace_get_work);
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
+ }
+}
+
+/**
+ * netfs_read_subreq_progress - Note progress of a read operation.
+ * @subreq: The read subrequest that has made progress.
+ * @was_async: True if we're in an asynchronous context.
+ *
+ * This tells the read side of netfs lib that a contributory I/O operation has
+ * made some progress and that it may be possible to unlock some folios.
+ *
+ * Before calling, the filesystem should update subreq->transferred to track
+ * the amount of data copied into the output buffer.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ */
+void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
+ bool was_async)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_progress);
+
+ if (subreq->transferred > subreq->consumed &&
+ (rreq->origin == NETFS_READAHEAD ||
+ rreq->origin == NETFS_READPAGE ||
+ rreq->origin == NETFS_READ_FOR_WRITE)) {
+ netfs_consume_read_data(subreq, was_async);
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+}
+EXPORT_SYMBOL(netfs_read_subreq_progress);
+
+/**
+ * netfs_read_subreq_terminated - Note the termination of an I/O operation.
+ * @subreq: The I/O request that has terminated.
+ * @error: Error code indicating type of completion.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the read helper that a contributory I/O operation has terminated,
+ * one way or another, and that it should integrate the results.
+ *
+ * The caller indicates the outcome of the operation through @error, supplying
+ * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
+ * is set) or a negative error code. The helper will look after reissuing I/O
+ * operations as appropriate and writing downloaded data to the cache.
+ *
+ * Before calling, the filesystem should update subreq->transferred to track
+ * the amount of data copied into the output buffer.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ */
+void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
+ int error, bool was_async)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ switch (subreq->source) {
+ case NETFS_READ_FROM_CACHE:
+ netfs_stat(&netfs_n_rh_read_done);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_stat(&netfs_n_rh_download_done);
+ break;
+ default:
+ break;
+ }
+
+ if (rreq->origin != NETFS_DIO_READ) {
+ /* Collect buffered reads.
+ *
+ * If the read completed validly short, then we can clear the
+ * tail before going on to unlock the folios.
+ */
+ if (error == 0 && subreq->transferred < subreq->len &&
+ (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
+ test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
+ netfs_clear_unread(subreq);
+ subreq->transferred = subreq->len;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
+ }
+ if (subreq->transferred > subreq->consumed &&
+ (rreq->origin == NETFS_READAHEAD ||
+ rreq->origin == NETFS_READPAGE ||
+ rreq->origin == NETFS_READ_FOR_WRITE)) {
+ netfs_consume_read_data(subreq, was_async);
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+ rreq->transferred += subreq->transferred;
+ }
+
+ /* Deal with retry requests, short reads and errors. If we retry
+ * but don't make progress, we abandon the attempt.
+ */
+ if (!error && subreq->transferred < subreq->len) {
+ if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
+ } else {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_short);
+ if (subreq->transferred > subreq->consumed) {
+ __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
+ } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
+ } else {
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ error = -ENODATA;
+ }
+ }
+ }
+
+ subreq->error = error;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ if (unlikely(error < 0)) {
+ trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ netfs_stat(&netfs_n_rh_read_failed);
+ } else {
+ netfs_stat(&netfs_n_rh_download_failed);
+ set_bit(NETFS_RREQ_FAILED, &rreq->flags);
+ rreq->error = subreq->error;
+ }
+ }
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ netfs_rreq_terminated(rreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(netfs_read_subreq_terminated);
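A minimal usage sketch of the API documented above (the myfs_* names and the
call structure are hypothetical; only the subreq->transferred update, the
optional netfs_read_subreq_progress() call and the final
netfs_read_subreq_terminated() call are the real interface):

	/* Hypothetical completion handler for a filesystem's ->issue_read(). */
	static void myfs_read_done(struct myfs_call *call)
	{
		struct netfs_io_subrequest *subreq = call->subreq;

		if (call->error) {
			netfs_read_subreq_terminated(subreq, call->error, false);
			return;
		}

		/* Record how much data landed in the output buffer. */
		subreq->transferred += call->nr_received;
		if (call->hit_eof)
			__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);

		/* Optionally let netfs unlock folios as data arrives. */
		netfs_read_subreq_progress(subreq, false);

		netfs_read_subreq_terminated(subreq, 0, false);
	}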
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
new file mode 100644
index 000000000000..ba5af89d37fa
--- /dev/null
+++ b/fs/netfs/read_pgpriv2.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Read with PG_private_2 [DEPRECATED].
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
+
+/*
+ * [DEPRECATED] Mark a folio as requiring copy-to-cache using PG_private_2. The
+ * third mark in the folio queue is used to indicate that this folio needs
+ * writing.
+ */
+void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq,
+ struct netfs_io_request *rreq,
+ struct folio_queue *folioq,
+ int slot)
+{
+ struct folio *folio = folioq_folio(folioq, slot);
+
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+ folio_start_private_2(folio);
+ folioq_mark3(folioq, slot);
+}
+
+/*
+ * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an
+ * unrecoverable error.
+ */
+static void netfs_pgpriv2_cancel(struct folio_queue *folioq)
+{
+ struct folio *folio;
+ int slot;
+
+ while (folioq) {
+ if (!folioq->marks3) {
+ folioq = folioq->next;
+ continue;
+ }
+
+ slot = __ffs(folioq->marks3);
+ folio = folioq_folio(folioq, slot);
+
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
+ folio_end_private_2(folio);
+ folioq_unmark3(folioq, slot);
+ }
+}
+
+/*
+ * [DEPRECATED] Copy a folio to the cache with PG_private_2 set.
+ */
+static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio *folio)
+{
+ struct netfs_io_stream *cache = &wreq->io_streams[1];
+ size_t fsize = folio_size(folio), flen = fsize;
+ loff_t fpos = folio_pos(folio), i_size;
+ bool to_eof = false;
+
+ _enter("");
+
+ /* netfs_perform_write() may shift i_size around the folio, or out of the
+ * folio to beyond it, but it cannot move i_size into or through the folio
+ * because we have the folio locked.
+ */
+ i_size = i_size_read(wreq->inode);
+
+ if (fpos >= i_size) {
+ /* mmap beyond eof. */
+ _debug("beyond eof");
+ folio_end_private_2(folio);
+ return 0;
+ }
+
+ if (fpos + fsize > wreq->i_size)
+ wreq->i_size = i_size;
+
+ if (flen > i_size - fpos) {
+ flen = i_size - fpos;
+ to_eof = true;
+ } else if (flen == i_size - fpos) {
+ to_eof = true;
+ }
+
+ _debug("folio %zx %zx", flen, fsize);
+
+ trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+
+ /* Attach the folio to the rolling buffer. */
+ if (netfs_buffer_append_folio(wreq, folio, false) < 0)
+ return -ENOMEM;
+
+ cache->submit_extendable_to = fsize;
+ cache->submit_off = 0;
+ cache->submit_len = flen;
+
+ /* Attach the folio to one or more subrequests. For a big folio, we
+ * could end up with thousands of subrequests if the wsize is small -
+ * but we might need to wait during the creation of subrequests for
+ * network resources (eg. SMB credits).
+ */
+ do {
+ ssize_t part;
+
+ wreq->io_iter.iov_offset = cache->submit_off;
+
+ atomic64_set(&wreq->issued_to, fpos + cache->submit_off);
+ cache->submit_extendable_to = fsize - cache->submit_off;
+ part = netfs_advance_write(wreq, cache, fpos + cache->submit_off,
+ cache->submit_len, to_eof);
+ cache->submit_off += part;
+ if (part > cache->submit_len)
+ cache->submit_len = 0;
+ else
+ cache->submit_len -= part;
+ } while (cache->submit_len > 0);
+
+ wreq->io_iter.iov_offset = 0;
+ iov_iter_advance(&wreq->io_iter, fsize);
+ atomic64_set(&wreq->issued_to, fpos + fsize);
+
+ if (flen < fsize)
+ netfs_issue_write(wreq, cache);
+
+ _leave(" = 0");
+ return 0;
+}
+
+/*
+ * [DEPRECATED] Go through the buffer and write any folios that are marked with
+ * the third mark to the cache.
+ */
+void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
+{
+ struct netfs_io_request *wreq;
+ struct folio_queue *folioq;
+ struct folio *folio;
+ int error = 0;
+ int slot = 0;
+
+ _enter("");
+
+ if (!fscache_resources_valid(&rreq->cache_resources))
+ goto couldnt_start;
+
+ /* Need the first folio to be able to set up the op. */
+ for (folioq = rreq->buffer; folioq; folioq = folioq->next) {
+ if (folioq->marks3) {
+ slot = __ffs(folioq->marks3);
+ break;
+ }
+ }
+ if (!folioq)
+ return;
+ folio = folioq_folio(folioq, slot);
+
+ wreq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio),
+ NETFS_PGPRIV2_COPY_TO_CACHE);
+ if (IS_ERR(wreq)) {
+ kleave(" [create %ld]", PTR_ERR(wreq));
+ goto couldnt_start;
+ }
+
+ trace_netfs_write(wreq, netfs_write_trace_copy_to_cache);
+ netfs_stat(&netfs_n_wh_copy_to_cache);
+
+ for (;;) {
+ error = netfs_pgpriv2_copy_folio(wreq, folio);
+ if (error < 0)
+ break;
+
+ folioq_unmark3(folioq, slot);
+ if (!folioq->marks3) {
+ folioq = folioq->next;
+ if (!folioq)
+ break;
+ }
+
+ slot = __ffs(folioq->marks3);
+ folio = folioq_folio(folioq, slot);
+ }
+
+ netfs_issue_write(wreq, &wreq->io_streams[1]);
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = %d", error);
+couldnt_start:
+ netfs_pgpriv2_cancel(rreq->buffer);
+}
+
+/*
+ * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished
+ * copying.
+ */
+bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq)
+{
+ struct folio_queue *folioq = wreq->buffer;
+ unsigned long long collected_to = wreq->collected_to;
+ unsigned int slot = wreq->buffer_head_slot;
+ bool made_progress = false;
+
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = netfs_delete_buffer_head(wreq);
+ slot = 0;
+ }
+
+ for (;;) {
+ struct folio *folio;
+ unsigned long long fpos, fend;
+ size_t fsize, flen;
+
+ folio = folioq_folio(folioq, slot);
+ if (WARN_ONCE(!folio_test_private_2(folio),
+ "R=%08x: folio %lx is not marked private_2\n",
+ wreq->debug_id, folio->index))
+ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+
+ fpos = folio_pos(folio);
+ fsize = folio_size(folio);
+ flen = fsize;
+
+ fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
+
+ trace_netfs_collect_folio(wreq, folio, fend, collected_to);
+
+ /* Unlock any folio we've transferred all of. */
+ if (collected_to < fend)
+ break;
+
+ trace_netfs_folio(folio, netfs_folio_trace_end_copy);
+ folio_end_private_2(folio);
+ wreq->cleaned_to = fpos + fsize;
+ made_progress = true;
+
+ /* Clean up the head folioq. If we clear an entire folioq, then
+ * we can get rid of it provided it's not also the tail folioq
+ * being filled by the issuer.
+ */
+ folioq_clear(folioq, slot);
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ if (READ_ONCE(wreq->buffer_tail) == folioq)
+ break;
+ folioq = netfs_delete_buffer_head(wreq);
+ slot = 0;
+ }
+
+ if (fpos + fsize >= collected_to)
+ break;
+ }
+
+ wreq->buffer = folioq;
+ wreq->buffer_head_slot = slot;
+ return made_progress;
+}
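All of the third-mark handling in this file follows the same folio_queue walk;
as a generic sketch (handle_folio() is a hypothetical callback, the folioq
accessors are the ones used above):

	static void walk_marked3(struct folio_queue *folioq,
				 void (*handle_folio)(struct folio *))
	{
		unsigned int slot;

		while (folioq) {
			if (!folioq->marks3) {
				folioq = folioq->next;
				continue;
			}

			slot = __ffs(folioq->marks3);
			handle_folio(folioq_folio(folioq, slot));
			folioq_unmark3(folioq, slot);
		}
	}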
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
new file mode 100644
index 000000000000..0350592ea804
--- /dev/null
+++ b/fs/netfs/read_retry.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem read subrequest retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static void netfs_reissue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ struct iov_iter *io_iter = &subreq->io_iter;
+
+ if (iov_iter_is_folioq(io_iter)) {
+ subreq->curr_folioq = (struct folio_queue *)io_iter->folioq;
+ subreq->curr_folioq_slot = io_iter->folioq_slot;
+ subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
+ }
+
+ atomic_inc(&rreq->nr_outstanding);
+ __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ subreq->rreq->netfs_ops->issue_read(subreq);
+}
+
+/*
+ * Go through the list of failed/short reads, retrying all retryable ones. We
+ * need to switch failed cache reads to network downloads.
+ */
+static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream0 = &rreq->io_streams[0];
+ LIST_HEAD(sublist);
+ LIST_HEAD(queue);
+
+ _enter("R=%x", rreq->debug_id);
+
+ if (list_empty(&rreq->subrequests))
+ return;
+
+ if (rreq->netfs_ops->retry_request)
+ rreq->netfs_ops->retry_request(rreq, NULL);
+
+ /* If there's no renegotiation to do, just resend each retryable subreq
+ * up to the first permanently failed one.
+ */
+ if (!rreq->netfs_ops->prepare_read &&
+ !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) {
+ struct netfs_io_subrequest *subreq;
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+ break;
+ if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ netfs_reset_iter(subreq);
+ netfs_reissue_read(rreq, subreq);
+ }
+ }
+ return;
+ }
+
+ /* Okay, we need to renegotiate all the download requests and flip any
+ * failed cache reads over to being download requests and negotiate
+ * those also. All fully successful subreqs have been removed from the
+ * list and any spare data from those has been donated.
+ *
+ * What we do is decant the list and rebuild it one subreq at a time so
+ * that we don't end up with donations jumping over a gap we're busy
+ * populating with smaller subrequests. In the event that the subreq
+ * we just launched finishes before we insert the next subreq, it'll
+ * fill in rreq->prev_donated instead.
+ *
+ * Note: Alternatively, we could split the tail subrequest right before
+ * we reissue it and fix up the donations under lock.
+ */
+ list_splice_init(&rreq->subrequests, &queue);
+
+ do {
+ struct netfs_io_subrequest *from;
+ struct iov_iter source;
+ unsigned long long start, len;
+ size_t part, deferred_next_donated = 0;
+ bool boundary = false;
+
+ /* Go through the subreqs and find the next span of contiguous
+ * buffer that we then rejig (cifs, for example, needs the
+ * rsize renegotiating) and reissue.
+ */
+ from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link);
+ list_move_tail(&from->rreq_link, &sublist);
+ start = from->start + from->transferred;
+ len = from->len - from->transferred;
+
+ _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx",
+ rreq->debug_id, from->debug_index,
+ from->start, from->consumed, from->transferred, from->len);
+
+ if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+ goto abandon;
+
+ deferred_next_donated = from->next_donated;
+ while ((subreq = list_first_entry_or_null(
+ &queue, struct netfs_io_subrequest, rreq_link))) {
+ if (subreq->start != start + len ||
+ subreq->transferred > 0 ||
+ !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+ break;
+ list_move_tail(&subreq->rreq_link, &sublist);
+ len += subreq->len;
+ deferred_next_donated = subreq->next_donated;
+ if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags))
+ break;
+ }
+
+ _debug(" - range: %llx-%llx %llx", start, start + len - 1, len);
+
+ /* Determine the set of buffers we're going to use. Each
+ * subreq gets a subset of a single overall contiguous buffer.
+ */
+ netfs_reset_iter(from);
+ source = from->io_iter;
+ source.count = len;
+
+ /* Work through the sublist. */
+ while ((subreq = list_first_entry_or_null(
+ &sublist, struct netfs_io_subrequest, rreq_link))) {
+ list_del(&subreq->rreq_link);
+
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->start = start - subreq->transferred;
+ subreq->len = len + subreq->transferred;
+ stream0->sreq_max_len = subreq->len;
+
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+
+ spin_lock_bh(&rreq->lock);
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+ subreq->prev_donated += rreq->prev_donated;
+ rreq->prev_donated = 0;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+ spin_unlock_bh(&rreq->lock);
+
+ BUG_ON(!len);
+
+ /* Renegotiate max_len (rsize) */
+ if (rreq->netfs_ops->prepare_read(subreq) < 0) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);
+ __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ }
+
+ part = umin(len, stream0->sreq_max_len);
+ if (unlikely(rreq->io_streams[0].sreq_max_segs))
+ part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs);
+ subreq->len = subreq->transferred + part;
+ subreq->io_iter = source;
+ iov_iter_truncate(&subreq->io_iter, part);
+ iov_iter_advance(&source, part);
+ len -= part;
+ start += part;
+ if (!len) {
+ if (boundary)
+ __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
+ subreq->next_donated = deferred_next_donated;
+ } else {
+ __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
+ subreq->next_donated = 0;
+ }
+
+ netfs_reissue_read(rreq, subreq);
+ if (!len)
+ break;
+
+ /* If we ran out of subrequests, allocate another. */
+ if (list_empty(&sublist)) {
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq)
+ goto abandon;
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->start = start;
+
+ /* We get two refs, but need just one. */
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_new);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_split);
+ list_add_tail(&subreq->rreq_link, &sublist);
+ }
+ }
+
+ /* If we managed to use fewer subreqs, we can discard the
+ * excess.
+ */
+ while ((subreq = list_first_entry_or_null(
+ &sublist, struct netfs_io_subrequest, rreq_link))) {
+ trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ }
+
+ } while (!list_empty(&queue));
+
+ return;
+
+ /* If we hit ENOMEM, fail all remaining subrequests */
+abandon:
+ list_splice_init(&sublist, &queue);
+ list_for_each_entry(subreq, &queue, rreq_link) {
+ if (!subreq->error)
+ subreq->error = -ENOMEM;
+ __clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
+ __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+ __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+ }
+ spin_lock_bh(&rreq->lock);
+ list_splice_tail_init(&queue, &rreq->subrequests);
+ spin_unlock_bh(&rreq->lock);
+}
+
+/*
+ * Retry reads.
+ */
+void netfs_retry_reads(struct netfs_io_request *rreq)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+
+ atomic_inc(&rreq->nr_outstanding);
+
+ netfs_retry_read_subrequests(rreq);
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ netfs_rreq_terminated(rreq, false);
+}
+
+/*
+ * Unlock any folios that haven't been unlocked yet due to abandoned
+ * subrequests.
+ */
+void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq)
+{
+ struct folio_queue *p;
+
+ for (p = rreq->buffer; p; p = p->next) {
+ for (int slot = 0; slot < folioq_count(p); slot++) {
+ struct folio *folio = folioq_folio(p, slot);
+
+ if (folio && !folioq_is_marked2(p, slot)) {
+ trace_netfs_folio(folio, netfs_folio_trace_abandon);
+ folio_unlock(folio);
+ }
+ }
+ }
+}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 0892768eea32..8e63516b40f6 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -32,6 +32,7 @@ atomic_t netfs_n_wh_buffered_write;
atomic_t netfs_n_wh_writethrough;
atomic_t netfs_n_wh_dio_write;
atomic_t netfs_n_wh_writepages;
+atomic_t netfs_n_wh_copy_to_cache;
atomic_t netfs_n_wh_wstream_conflict;
atomic_t netfs_n_wh_upload;
atomic_t netfs_n_wh_upload_done;
@@ -39,45 +40,53 @@ atomic_t netfs_n_wh_upload_failed;
atomic_t netfs_n_wh_write;
atomic_t netfs_n_wh_write_done;
atomic_t netfs_n_wh_write_failed;
+atomic_t netfs_n_wb_lock_skip;
+atomic_t netfs_n_wb_lock_wait;
+atomic_t netfs_n_folioq;
int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n",
+ seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n",
atomic_read(&netfs_n_rh_dio_read),
atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_read_folio),
atomic_read(&netfs_n_rh_write_begin),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n",
+ seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n",
atomic_read(&netfs_n_wh_buffered_write),
atomic_read(&netfs_n_wh_writethrough),
atomic_read(&netfs_n_wh_dio_write),
- atomic_read(&netfs_n_wh_writepages));
- seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
+ atomic_read(&netfs_n_wh_writepages),
+ atomic_read(&netfs_n_wh_copy_to_cache));
+ seq_printf(m, "ZeroOps: ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
+ seq_printf(m, "DownOps: DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
- seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
+ seq_printf(m, "CaRdOps: RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
- seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
+ seq_printf(m, "UpldOps: UL=%u us=%u uf=%u\n",
atomic_read(&netfs_n_wh_upload),
atomic_read(&netfs_n_wh_upload_done),
atomic_read(&netfs_n_wh_upload_failed));
- seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
+ seq_printf(m, "CaWrOps: WR=%u ws=%u wf=%u\n",
atomic_read(&netfs_n_wh_write),
atomic_read(&netfs_n_wh_write_done),
atomic_read(&netfs_n_wh_write_failed));
- seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
+ seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n",
atomic_read(&netfs_n_rh_rreq),
atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_folioq),
atomic_read(&netfs_n_wh_wstream_conflict));
+ seq_printf(m, "WbLock : skip=%u wait=%u\n",
+ atomic_read(&netfs_n_wb_lock_skip),
+ atomic_read(&netfs_n_wb_lock_wait));
return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);
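With these changes, /proc/fs/netfs/stats would read roughly as follows (the
counter values are purely illustrative, and the FS-Cache lines emitted by
fscache_stats_show() follow at the end):

	Reads : DR=0 RA=4096 RF=117 WB=89 WBZ=12
	Writes : BW=2040 WT=0 DW=3 WP=61 2C=18
	ZeroOps: ZR=5 sh=2 sk=12
	DownOps: DL=833 ds=833 df=0 di=0
	CaRdOps: RD=61 rs=61 rf=0
	UpldOps: UL=790 us=790 uf=0
	CaWrOps: WR=44 ws=44 wf=0
	Objs : rr=2 sr=5 foq=9 wsc=0
	WbLock : skip=3 wait=1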
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 426cf87aaf2e..1d438be2e1b4 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -15,15 +15,11 @@
/* Notes made in the collector */
#define HIT_PENDING 0x01 /* A front op was still pending */
-#define SOME_EMPTY 0x02 /* One of more streams are empty */
-#define ALL_EMPTY 0x04 /* All streams are empty */
-#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */
-#define NEED_REASSESS 0x10 /* Need to loop round and reassess */
-#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */
-#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */
-#define BUFFERED 0x80 /* The pagecache needs cleaning up */
-#define NEED_RETRY 0x100 /* A front op requests retrying */
-#define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */
+#define NEED_REASSESS 0x02 /* Need to loop round and reassess */
+#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */
+#define BUFFERED 0x08 /* The pagecache needs cleaning up */
+#define NEED_RETRY 0x10 /* A front op requests retrying */
+#define SAW_FAILURE 0x20 /* A stream hit a permanent failure */
/*
* Successful completion of write of a folio to the server and/or cache. Note
@@ -33,6 +29,7 @@
int netfs_folio_written_back(struct folio *folio)
{
enum netfs_folio_trace why = netfs_folio_trace_clear;
+ struct netfs_inode *ictx = netfs_inode(folio->mapping->host);
struct netfs_folio *finfo;
struct netfs_group *group = NULL;
int gcount = 0;
@@ -41,6 +38,12 @@ int netfs_folio_written_back(struct folio *folio)
/* Streaming writes cannot be redirtied whilst under writeback,
* so discard the streaming record.
*/
+ unsigned long long fend;
+
+ fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
+ if (fend > ictx->zero_point)
+ ictx->zero_point = fend;
+
folio_detach_private(folio);
group = finfo->netfs_group;
gcount++;
@@ -75,55 +78,37 @@ end_wb:
}
/*
- * Get hold of a folio we have under writeback. We don't want to get the
- * refcount on it.
+ * Unlock any folios we've finished with.
*/
-static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos)
+static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
+ unsigned int *notes)
{
- XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE);
- struct folio *folio;
-
- rcu_read_lock();
-
- for (;;) {
- xas_reset(&xas);
- folio = xas_load(&xas);
- if (xas_retry(&xas, folio))
- continue;
+ struct folio_queue *folioq = wreq->buffer;
+ unsigned long long collected_to = wreq->collected_to;
+ unsigned int slot = wreq->buffer_head_slot;
- if (!folio || xa_is_value(folio))
- kdebug("R=%08x: folio %lx (%llx) not present",
- wreq->debug_id, xas.xa_index, pos / PAGE_SIZE);
- BUG_ON(!folio || xa_is_value(folio));
-
- if (folio == xas_reload(&xas))
- break;
+ if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
+ if (netfs_pgpriv2_unlock_copied_folios(wreq))
+ *notes |= MADE_PROGRESS;
+ return;
}
- rcu_read_unlock();
-
- if (WARN_ONCE(!folio_test_writeback(folio),
- "R=%08x: folio %lx is not under writeback\n",
- wreq->debug_id, folio->index)) {
- trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = netfs_delete_buffer_head(wreq);
+ slot = 0;
}
- return folio;
-}
-/*
- * Unlock any folios we've finished with.
- */
-static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
- unsigned long long collected_to,
- unsigned int *notes)
-{
for (;;) {
struct folio *folio;
struct netfs_folio *finfo;
unsigned long long fpos, fend;
size_t fsize, flen;
- folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to);
+ folio = folioq_folio(folioq, slot);
+ if (WARN_ONCE(!folio_test_writeback(folio),
+ "R=%08x: folio %lx is not under writeback\n",
+ wreq->debug_id, folio->index))
+ trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
fpos = folio_pos(folio);
fsize = folio_size(folio);
@@ -134,12 +119,6 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
trace_netfs_collect_folio(wreq, folio, fend, collected_to);
- if (fpos + fsize > wreq->contiguity) {
- trace_netfs_collect_contig(wreq, fpos + fsize,
- netfs_contig_trace_unlock);
- wreq->contiguity = fpos + fsize;
- }
-
/* Unlock any folio we've transferred all of. */
if (collected_to < fend)
break;
@@ -148,9 +127,25 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
wreq->cleaned_to = fpos + fsize;
*notes |= MADE_PROGRESS;
+ /* Clean up the head folioq. If we clear an entire folioq, then
+ * we can get rid of it provided it's not also the tail folioq
+ * being filled by the issuer.
+ */
+ folioq_clear(folioq, slot);
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ if (READ_ONCE(wreq->buffer_tail) == folioq)
+ break;
+ folioq = netfs_delete_buffer_head(wreq);
+ slot = 0;
+ }
+
if (fpos + fsize >= collected_to)
break;
}
+
+ wreq->buffer = folioq;
+ wreq->buffer_head_slot = slot;
}
/*
@@ -181,9 +176,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+ struct iov_iter source = subreq->io_iter;
+
+ iov_iter_revert(&source, subreq->len - source.count);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
- netfs_reissue_write(stream, subreq);
+ netfs_reissue_write(stream, subreq, &source);
}
}
return;
@@ -193,6 +191,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
do {
struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
+ struct iov_iter source;
unsigned long long start, len;
size_t part;
bool boundary = false;
@@ -220,6 +219,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
len += to->len;
}
+ /* Determine the set of buffers we're going to use. Each
+ * subreq gets a subset of a single overall contiguous buffer.
+ */
+ netfs_reset_iter(from);
+ source = from->io_iter;
+ source.count = len;
+
/* Work through the sublist. */
subreq = from;
list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
@@ -231,7 +237,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
stream->prepare_write(subreq);
- part = min(len, subreq->max_len);
+ part = min(len, stream->sreq_max_len);
subreq->len = part;
subreq->start = start;
subreq->transferred = 0;
@@ -242,7 +248,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
boundary = true;
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
- netfs_reissue_write(stream, subreq);
+ netfs_reissue_write(stream, subreq, &source);
if (subreq == to)
break;
}
@@ -271,8 +277,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
subreq = netfs_alloc_subrequest(wreq);
subreq->source = to->source;
subreq->start = start;
- subreq->max_len = len;
- subreq->max_nr_segs = INT_MAX;
subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
subreq->stream_nr = to->stream_nr;
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
@@ -286,10 +290,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
to = list_next_entry(to, rreq_link);
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+ stream->sreq_max_len = len;
+ stream->sreq_max_segs = INT_MAX;
switch (stream->source) {
case NETFS_UPLOAD_TO_SERVER:
netfs_stat(&netfs_n_wh_upload);
- subreq->max_len = min(len, wreq->wsize);
+ stream->sreq_max_len = umin(len, wreq->wsize);
break;
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write);
@@ -300,7 +306,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
stream->prepare_write(subreq);
- part = min(len, subreq->max_len);
+ part = umin(len, stream->sreq_max_len);
subreq->len = subreq->transferred + part;
len -= part;
start += part;
@@ -309,7 +315,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
boundary = false;
}
- netfs_reissue_write(stream, subreq);
+ netfs_reissue_write(stream, subreq, &source);
if (!len)
break;
@@ -370,7 +376,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
{
struct netfs_io_subrequest *front, *remove;
struct netfs_io_stream *stream;
- unsigned long long collected_to;
+ unsigned long long collected_to, issued_to;
unsigned int notes;
int s;
@@ -379,28 +385,22 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
reassess_streams:
+ issued_to = atomic64_read(&wreq->issued_to);
smp_rmb();
collected_to = ULLONG_MAX;
- if (wreq->origin == NETFS_WRITEBACK)
- notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG;
- else if (wreq->origin == NETFS_WRITETHROUGH)
- notes = ALL_EMPTY | BUFFERED;
+ if (wreq->origin == NETFS_WRITEBACK ||
+ wreq->origin == NETFS_WRITETHROUGH ||
+ wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
+ notes = BUFFERED;
else
- notes = ALL_EMPTY;
+ notes = 0;
/* Remove completed subrequests from the front of the streams and
* advance the completion point on each stream. We stop when we hit
* something that's in progress. The issuer thread may be adding stuff
* to the tail whilst we're doing this.
- *
- * We must not, however, merge in discontiguities that span whole
- * folios that aren't under writeback. This is made more complicated
- * by the folios in the gap being of unpredictable sizes - if they even
- * exist - but we don't want to look them up.
*/
for (s = 0; s < NR_IO_STREAMS; s++) {
- loff_t rstart, rend;
-
stream = &wreq->io_streams[s];
/* Read active flag before list pointers */
if (!smp_load_acquire(&stream->active))
@@ -412,26 +412,10 @@ reassess_streams:
//_debug("sreq [%x] %llx %zx/%zx",
// front->debug_index, front->start, front->transferred, front->len);
- /* Stall if there may be a discontinuity. */
- rstart = round_down(front->start, PAGE_SIZE);
- if (rstart > wreq->contiguity) {
- if (wreq->contiguity > stream->collected_to) {
- trace_netfs_collect_gap(wreq, stream,
- wreq->contiguity, 'D');
- stream->collected_to = wreq->contiguity;
- }
- notes |= REASSESS_DISCONTIG;
- break;
+ if (stream->collected_to < front->start) {
+ trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
+ stream->collected_to = front->start;
}
- rend = round_up(front->start + front->len, PAGE_SIZE);
- if (rend > wreq->contiguity) {
- trace_netfs_collect_contig(wreq, rend,
- netfs_contig_trace_collect);
- wreq->contiguity = rend;
- if (notes & REASSESS_DISCONTIG)
- notes |= NEED_REASSESS;
- }
- notes &= ~MAYBE_DISCONTIG;
/* Stall if the front is still undergoing I/O. */
if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
@@ -466,33 +450,27 @@ reassess_streams:
cancel:
/* Remove if completely consumed. */
- spin_lock(&wreq->lock);
+ spin_lock_bh(&wreq->lock);
remove = front;
list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
stream->front = front;
- if (!front) {
- unsigned long long jump_to = atomic64_read(&wreq->issued_to);
-
- if (stream->collected_to < jump_to) {
- trace_netfs_collect_gap(wreq, stream, jump_to, 'A');
- stream->collected_to = jump_to;
- }
- }
-
- spin_unlock(&wreq->lock);
+ spin_unlock_bh(&wreq->lock);
netfs_put_subrequest(remove, false,
notes & SAW_FAILURE ?
netfs_sreq_trace_put_cancel :
netfs_sreq_trace_put_done);
}
- if (front)
- notes &= ~ALL_EMPTY;
- else
- notes |= SOME_EMPTY;
+ /* If we have an empty stream, we need to jump it forward
+ * otherwise the collection point will never advance.
+ */
+ if (!front && issued_to > stream->collected_to) {
+ trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
+ stream->collected_to = issued_to;
+ }
if (stream->collected_to < collected_to)
collected_to = stream->collected_to;
@@ -501,36 +479,6 @@ reassess_streams:
if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
wreq->collected_to = collected_to;
- /* If we have an empty stream, we need to jump it forward over any gap
- * otherwise the collection point will never advance.
- *
- * Note that the issuer always adds to the stream with the lowest
- * so-far submitted start, so if we see two consecutive subreqs in one
- * stream with nothing between then in another stream, then the second
- * stream has a gap that can be jumped.
- */
- if (notes & SOME_EMPTY) {
- unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted);
-
- for (s = 0; s < NR_IO_STREAMS; s++) {
- stream = &wreq->io_streams[s];
- if (stream->active &&
- stream->front &&
- stream->front->start < jump_to)
- jump_to = stream->front->start;
- }
-
- for (s = 0; s < NR_IO_STREAMS; s++) {
- stream = &wreq->io_streams[s];
- if (stream->active &&
- !stream->front &&
- stream->collected_to < jump_to) {
- trace_netfs_collect_gap(wreq, stream, jump_to, 'B');
- stream->collected_to = jump_to;
- }
- }
- }
-
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->active)
@@ -541,43 +489,14 @@ reassess_streams:
/* Unlock any folios that we have now finished with. */
if (notes & BUFFERED) {
- unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity);
-
- if (wreq->cleaned_to < clean_to)
- netfs_writeback_unlock_folios(wreq, clean_to, &notes);
+ if (wreq->cleaned_to < wreq->collected_to)
+ netfs_writeback_unlock_folios(wreq, &notes);
} else {
wreq->cleaned_to = wreq->collected_to;
}
// TODO: Discard encryption buffers
- /* If all streams are discontiguous with the last folio we cleared, we
- * may need to skip a set of folios.
- */
- if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) {
- unsigned long long jump_to = ULLONG_MAX;
-
- for (s = 0; s < NR_IO_STREAMS; s++) {
- stream = &wreq->io_streams[s];
- if (stream->active && stream->front &&
- stream->front->start < jump_to)
- jump_to = stream->front->start;
- }
-
- trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump);
- wreq->contiguity = jump_to;
- wreq->cleaned_to = jump_to;
- wreq->collected_to = jump_to;
- for (s = 0; s < NR_IO_STREAMS; s++) {
- stream = &wreq->io_streams[s];
- if (stream->collected_to < jump_to)
- stream->collected_to = jump_to;
- }
- //cond_resched();
- notes |= MADE_PROGRESS;
- goto reassess_streams;
- }
-
if (notes & NEED_RETRY)
goto need_retry;
if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 9258d30cffe3..04e66d587f77 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -94,6 +94,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
{
struct netfs_io_request *wreq;
struct netfs_inode *ictx;
+ bool is_buffered = (origin == NETFS_WRITEBACK ||
+ origin == NETFS_WRITETHROUGH ||
+ origin == NETFS_PGPRIV2_COPY_TO_CACHE);
wreq = netfs_alloc_request(mapping, file, start, 0, origin);
if (IS_ERR(wreq))
@@ -102,12 +105,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
_enter("R=%x", wreq->debug_id);
ictx = netfs_inode(wreq->inode);
- if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ if (is_buffered && netfs_is_cache_enabled(ictx))
fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
- wreq->contiguity = wreq->start;
wreq->cleaned_to = wreq->start;
- INIT_WORK(&wreq->work, netfs_write_collection_worker);
wreq->io_streams[0].stream_nr = 0;
wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER;
@@ -156,22 +157,19 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
subreq = netfs_alloc_subrequest(wreq);
subreq->source = stream->source;
subreq->start = start;
- subreq->max_len = ULONG_MAX;
- subreq->max_nr_segs = INT_MAX;
subreq->stream_nr = stream->stream_nr;
+ subreq->io_iter = wreq->io_iter;
_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
- trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
- refcount_read(&subreq->ref),
- netfs_sreq_trace_new);
-
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ stream->sreq_max_len = UINT_MAX;
+ stream->sreq_max_segs = INT_MAX;
switch (stream->source) {
case NETFS_UPLOAD_TO_SERVER:
netfs_stat(&netfs_n_wh_upload);
- subreq->max_len = wreq->wsize;
+ stream->sreq_max_len = wreq->wsize;
break;
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write);
@@ -190,7 +188,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
* the list. The collector only goes nextwards and uses the lock to
* remove entries off of the front.
*/
- spin_lock(&wreq->lock);
+ spin_lock_bh(&wreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
stream->front = subreq;
@@ -201,7 +199,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
}
}
- spin_unlock(&wreq->lock);
+ spin_unlock_bh(&wreq->lock);
stream->construct = subreq;
}
@@ -221,41 +219,34 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream,
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
return netfs_write_subrequest_terminated(subreq, subreq->error, false);
- // TODO: Use encrypted buffer
- if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
- subreq->io_iter = wreq->io_iter;
- iov_iter_advance(&subreq->io_iter,
- subreq->start + subreq->transferred - wreq->start);
- iov_iter_truncate(&subreq->io_iter,
- subreq->len - subreq->transferred);
- } else {
- iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- }
-
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
stream->issue_write(subreq);
}
void netfs_reissue_write(struct netfs_io_stream *stream,
- struct netfs_io_subrequest *subreq)
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *source)
{
+ size_t size = subreq->len - subreq->transferred;
+
+ // TODO: Use encrypted buffer
+ subreq->io_iter = *source;
+ iov_iter_advance(source, size);
+ iov_iter_truncate(&subreq->io_iter, size);
+
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
netfs_do_issue_write(stream, subreq);
}
-static void netfs_issue_write(struct netfs_io_request *wreq,
- struct netfs_io_stream *stream)
+void netfs_issue_write(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream)
{
struct netfs_io_subrequest *subreq = stream->construct;
if (!subreq)
return;
stream->construct = NULL;
-
- if (subreq->start + subreq->len > wreq->start + wreq->submitted)
- WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start);
+ subreq->io_iter.count = subreq->len;
netfs_do_issue_write(stream, subreq);
}
@@ -288,13 +279,14 @@ int netfs_advance_write(struct netfs_io_request *wreq,
netfs_prepare_write(wreq, stream, start);
subreq = stream->construct;
- part = min(subreq->max_len - subreq->len, len);
- _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+ part = umin(stream->sreq_max_len - subreq->len, len);
+ _debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len);
subreq->len += part;
subreq->nr_segs++;
+ stream->submit_extendable_to -= part;
- if (subreq->len >= subreq->max_len ||
- subreq->nr_segs >= subreq->max_nr_segs ||
+ if (subreq->len >= stream->sreq_max_len ||
+ subreq->nr_segs >= stream->sreq_max_segs ||
to_eof) {
netfs_issue_write(wreq, stream);
subreq = NULL;
@@ -408,19 +400,26 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
folio_unlock(folio);
if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
- if (!fscache_resources_valid(&wreq->cache_resources)) {
+ if (!cache->avail) {
trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
netfs_issue_write(wreq, upload);
netfs_folio_written_back(folio);
return 0;
}
trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+ } else if (!upload->avail && !cache->avail) {
+ trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
+ netfs_folio_written_back(folio);
+ return 0;
} else if (!upload->construct) {
trace_netfs_folio(folio, netfs_folio_trace_store);
} else {
trace_netfs_folio(folio, netfs_folio_trace_store_plus);
}
+ /* Attach the folio to the rolling buffer. */
+ netfs_buffer_append_folio(wreq, folio, false);
+
/* Move the submission point forward to allow for write-streaming data
* not starting at the front of the page. We don't do write-streaming
* with the cache as the cache requires DIO alignment.
@@ -430,7 +429,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
*/
for (int s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
- stream->submit_max_len = fsize;
stream->submit_off = foff;
stream->submit_len = flen;
if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
@@ -438,7 +436,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
stream->submit_off = UINT_MAX;
stream->submit_len = 0;
- stream->submit_max_len = 0;
}
}
@@ -465,12 +462,13 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (choose_s < 0)
break;
stream = &wreq->io_streams[choose_s];
+ wreq->io_iter.iov_offset = stream->submit_off;
+ atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+ stream->submit_extendable_to = fsize - stream->submit_off;
part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
stream->submit_len, to_eof);
- atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
stream->submit_off += part;
- stream->submit_max_len -= part;
if (part > stream->submit_len)
stream->submit_len = 0;
else
@@ -479,6 +477,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
debug = true;
}
+ wreq->io_iter.iov_offset = 0;
+ iov_iter_advance(&wreq->io_iter, fsize);
atomic64_set(&wreq->issued_to, fpos + fsize);
if (!debug)
@@ -503,10 +503,14 @@ int netfs_writepages(struct address_space *mapping,
struct folio *folio;
int error = 0;
- if (wbc->sync_mode == WB_SYNC_ALL)
+ if (!mutex_trylock(&ictx->wb_lock)) {
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ netfs_stat(&netfs_n_wb_lock_skip);
+ return 0;
+ }
+ netfs_stat(&netfs_n_wb_lock_wait);
mutex_lock(&ictx->wb_lock);
- else if (!mutex_trylock(&ictx->wb_lock))
- return 0;
+ }
/* Need the first folio to be able to set up the op. */
folio = writeback_iter(mapping, wbc, NULL, &error);
@@ -523,10 +527,10 @@ int netfs_writepages(struct address_space *mapping,
netfs_stat(&netfs_n_wh_writepages);
do {
- _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+ _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));
/* It appears we don't have to handle cyclic writeback wrapping. */
- WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
+ WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to));
if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
@@ -670,6 +674,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
part = netfs_advance_write(wreq, upload, start, len, false);
start += part;
len -= part;
+ iov_iter_advance(&wreq->io_iter, part);
if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
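To make the shared-iterator bookkeeping concrete, a trimmed sketch of the
per-folio submission loop used by the copy-to-cache and writeback paths
earlier in this series (copy_span() is an invented wrapper; the field and
function usage mirrors netfs_pgpriv2_copy_folio() and netfs_write_folio()):

	static void copy_span(struct netfs_io_request *wreq,
			      struct netfs_io_stream *stream,
			      loff_t fpos, size_t fsize, size_t flen, bool to_eof)
	{
		stream->submit_off = 0;
		stream->submit_len = flen;

		do {
			ssize_t part;

			/* Point the shared iterator at this stream's chunk. */
			wreq->io_iter.iov_offset = stream->submit_off;
			atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
			stream->submit_extendable_to = fsize - stream->submit_off;

			part = netfs_advance_write(wreq, stream,
						   fpos + stream->submit_off,
						   stream->submit_len, to_eof);
			stream->submit_off += part;
			if (part > stream->submit_len)
				stream->submit_len = 0;
			else
				stream->submit_len -= part;
		} while (stream->submit_len > 0);

		/* Step the request iterator past the folio; flush a partial write. */
		wreq->io_iter.iov_offset = 0;
		iov_iter_advance(&wreq->io_iter, fsize);
		if (flen < fsize)
			netfs_issue_write(wreq, stream);
	}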
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 29c49a7e5fe1..6df77f008d3f 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -118,7 +118,9 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
if (likely(attrlen > 0))
bitmap[0] = ntohl(*p++);
if (attrlen > 1)
- bitmap[1] = ntohl(*p);
+ bitmap[1] = ntohl(*p++);
+ if (attrlen > 2)
+ bitmap[2] = ntohl(*p);
return 0;
}
@@ -446,7 +448,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
void *argp)
{
struct cb_recallanyargs *args = argp;
- uint32_t bitmap[2];
+ uint32_t bitmap[3];
__be32 *p, status;
p = xdr_inline_decode(xdr, 4);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d5edb3b3eeef..20cb2008f9e4 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -647,6 +647,9 @@ restart:
prev = delegation;
continue;
}
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ continue;
if (prev) {
struct inode *tmp = nfs_delegation_grab_inode(prev);
@@ -657,12 +660,6 @@ restart:
}
}
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- iput(to_put);
- goto restart;
- }
delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
@@ -1184,7 +1181,6 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
struct inode *inode;
restart:
rcu_read_lock();
-restart_locked:
list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
if (test_bit(NFS_DELEGATION_INODE_FREEING,
&delegation->flags) ||
@@ -1195,7 +1191,7 @@ restart_locked:
continue;
inode = nfs_delegation_grab_inode(delegation);
if (inode == NULL)
- goto restart_locked;
+ continue;
delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
if (delegation != NULL) {
@@ -1318,7 +1314,6 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
restart:
rcu_read_lock();
-restart_locked:
list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
if (test_bit(NFS_DELEGATION_INODE_FREEING,
&delegation->flags) ||
@@ -1330,7 +1325,7 @@ restart_locked:
continue;
inode = nfs_delegation_grab_inode(delegation);
if (inode == NULL)
- goto restart_locked;
+ continue;
spin_lock(&delegation->lock);
cred = get_cred_rcu(delegation->cred);
nfs4_stateid_copy(&stateid, &delegation->stateid);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 61a8cdb9f1e1..6800ee92d742 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -336,7 +336,7 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
* increment the page use counts until he is done with the page.
*/
static int nfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep,
+ loff_t pos, unsigned len, struct folio **foliop,
void **fsdata)
{
fgf_t fgp = FGP_WRITEBEGIN;
@@ -353,7 +353,7 @@ start:
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
- *pagep = &folio->page;
+ *foliop = folio;
ret = nfs_flush_incompatible(file, folio);
if (ret) {
@@ -372,10 +372,9 @@ start:
static int nfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct folio *folio = page_folio(page);
unsigned offset = offset_in_folio(folio, pos);
int status;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 7202ce84d0eb..810269ee0a50 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -265,6 +265,9 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi
{
rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
+ /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
+ rreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize;
return 0;
}
@@ -286,14 +289,6 @@ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sre
return netfs;
}
-static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq)
-{
- size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize;
-
- sreq->len = min(sreq->len, rsize);
- return true;
-}
-
static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
{
struct nfs_netfs_io_data *netfs;
@@ -302,17 +297,18 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
struct page *page;
unsigned long idx;
+ pgoff_t start, last;
int err;
- pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
- pgoff_t last = ((sreq->start + sreq->len -
- sreq->transferred - 1) >> PAGE_SHIFT);
+
+ start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
+ last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT);
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
netfs = nfs_netfs_alloc(sreq);
if (!netfs)
- return netfs_subreq_terminated(sreq, -ENOMEM, false);
+ return netfs_read_subreq_terminated(sreq, -ENOMEM, false);
pgio.pg_netfs = netfs; /* used in completion */
@@ -361,7 +357,8 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
return;
sreq = netfs->sreq;
- if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+ if (test_bit(NFS_IOHDR_EOF, &hdr->flags) &&
+ sreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
if (hdr->error)
@@ -377,5 +374,4 @@ const struct netfs_request_ops nfs_netfs_ops = {
.init_request = nfs_netfs_init_request,
.free_request = nfs_netfs_free_request,
.issue_read = nfs_netfs_issue_read,
- .clamp_length = nfs_netfs_clamp_length
};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index fbed0027996f..772d485e96d3 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -60,8 +60,6 @@ static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs)
static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
{
- ssize_t final_len;
-
/* Only the last RPC completion should call netfs_subreq_terminated() */
if (!refcount_dec_and_test(&netfs->refcount))
return;
@@ -74,15 +72,14 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
* Correct the final length here to be no larger than the netfs subrequest
* length, and thus avoid netfs's "Subreq overread" warning message.
*/
- final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred));
- netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false);
+ netfs->sreq->transferred = min_t(s64, netfs->sreq->len,
+ atomic64_read(&netfs->transferred));
+ netfs_read_subreq_terminated(netfs->sreq, netfs->error, false);
kfree(netfs);
}
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
{
netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
- /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
- __set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags);
}
extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8883016c551c..b8ffbe52ba15 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3931,7 +3931,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
FATTR4_WORD0_CASE_INSENSITIVE |
FATTR4_WORD0_CASE_PRESERVING;
if (minorversion)
- bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT |
+ FATTR4_WORD2_OPEN_ARGUMENTS;
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
@@ -9997,6 +9998,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
fallthrough;
default:
task->tk_status = 0;
+ lrp->res.lrs_present = 0;
fallthrough;
case 0:
break;
@@ -10010,9 +10012,11 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
task->tk_status = 0;
break;
case -NFS4ERR_DELAY:
- if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
- break;
- goto out_restart;
+ if (nfs4_async_handle_error(task, server, NULL, NULL) ==
+ -EAGAIN)
+ goto out_restart;
+ lrp->res.lrs_present = 0;
+ break;
}
return;
out_restart:
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index aa698481bec8..0d16b383a452 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1284,10 +1284,9 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
LIST_HEAD(freeme);
spin_lock(&inode->i_lock);
- if (!pnfs_layout_is_valid(lo) ||
- !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
+ if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
goto out_unlock;
- if (stateid) {
+ if (stateid && pnfs_layout_is_valid(lo)) {
u32 seq = be32_to_cpu(arg_stateid->seqid);
pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index cbbd4866b0b7..97b386032b71 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -47,6 +47,7 @@
#include <linux/vfs.h>
#include <linux/inet.h>
#include <linux/in6.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <net/ipv6.h>
#include <linux/netdevice.h>
@@ -228,6 +229,7 @@ static int __nfs_list_for_each_server(struct list_head *head,
ret = fn(server, data);
if (ret)
goto out;
+ cond_resched();
rcu_read_lock();
}
rcu_read_unlock();
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a20c2c9d7d45..a366fb1c1b9b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2789,15 +2789,18 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
deny & NFS4_SHARE_ACCESS_READ ? "r" : "-",
deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
- spin_lock(&nf->fi_lock);
- file = find_any_file_locked(nf);
- if (file) {
- nfs4_show_superblock(s, file);
- seq_puts(s, ", ");
- nfs4_show_fname(s, file);
- seq_puts(s, ", ");
- }
- spin_unlock(&nf->fi_lock);
+ if (nf) {
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
+ if (file) {
+ nfs4_show_superblock(s, file);
+ seq_puts(s, ", ");
+ nfs4_show_fname(s, file);
+ seq_puts(s, ", ");
+ }
+ spin_unlock(&nf->fi_lock);
+ } else
+ seq_puts(s, "closed, ");
nfs4_show_owner(s, oo);
if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
seq_puts(s, ", admin-revoked");
@@ -3075,9 +3078,9 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
struct nfs4_delegation *dp =
container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
- nfs4_put_stid(&dp->dl_stid);
clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY);
+ nfs4_put_stid(&dp->dl_stid);
}
static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
@@ -8812,7 +8815,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
/**
* nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
* @rqstp: RPC transaction context
- * @inode: file to be checked for a conflict
+ * @dentry: dentry of inode to be checked for a conflict
* @modified: return true if file was modified
* @size: new size of file if modified is true
*
@@ -8827,16 +8830,16 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
* code is returned.
*/
__be32
-nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode,
+nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
bool *modified, u64 *size)
{
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file_lock_context *ctx;
struct file_lease *fl;
- struct nfs4_delegation *dp;
struct iattr attrs;
struct nfs4_cb_fattr *ncf;
+ struct inode *inode = d_inode(dentry);
*modified = false;
ctx = locks_inode_context(inode);
@@ -8856,17 +8859,26 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode,
*/
if (type == F_RDLCK)
break;
- goto break_lease;
+
+ nfsd_stats_wdeleg_getattr_inc(nn);
+ spin_unlock(&ctx->flc_lock);
+
+ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+ if (status != nfserr_jukebox ||
+ !nfsd_wait_for_delegreturn(rqstp, inode))
+ return status;
+ return 0;
}
if (type == F_WRLCK) {
- dp = fl->c.flc_owner;
+ struct nfs4_delegation *dp = fl->c.flc_owner;
+
if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
spin_unlock(&ctx->flc_lock);
return 0;
}
-break_lease:
nfsd_stats_wdeleg_getattr_inc(nn);
dp = fl->c.flc_owner;
+ refcount_inc(&dp->dl_stid.sc_count);
ncf = &dp->dl_cb_fattr;
nfs4_cb_getattr(&dp->dl_cb_fattr);
spin_unlock(&ctx->flc_lock);
@@ -8876,27 +8888,37 @@ break_lease:
/* Recall delegation only if client didn't respond */
status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
if (status != nfserr_jukebox ||
- !nfsd_wait_for_delegreturn(rqstp, inode))
+ !nfsd_wait_for_delegreturn(rqstp, inode)) {
+ nfs4_put_stid(&dp->dl_stid);
return status;
+ }
}
if (!ncf->ncf_file_modified &&
(ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
ncf->ncf_file_modified = true;
if (ncf->ncf_file_modified) {
+ int err;
+
/*
* Per section 10.4.3 of RFC 8881, the server would
* not update the file's metadata with the client's
* modified size
*/
attrs.ia_mtime = attrs.ia_ctime = current_time(inode);
- attrs.ia_valid = ATTR_MTIME | ATTR_CTIME;
- setattr_copy(&nop_mnt_idmap, inode, &attrs);
- mark_inode_dirty(inode);
+ attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG;
+ inode_lock(inode);
+ err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL);
+ inode_unlock(inode);
+ if (err) {
+ nfs4_put_stid(&dp->dl_stid);
+ return nfserrno(err);
+ }
ncf->ncf_cur_fsize = ncf->ncf_cb_fsize;
*size = ncf->ncf_cur_fsize;
*modified = true;
}
+ nfs4_put_stid(&dp->dl_stid);
return 0;
}
break;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 42b41d55d4ed..97f583777972 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3545,6 +3545,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
args.dentry = dentry;
args.ignore_crossmnt = (ignore_crossmnt != 0);
args.acl = NULL;
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ args.context = NULL;
+#endif
/*
* Make a local copy of the attribute bitmap that can be modified.
@@ -3562,7 +3565,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
}
args.size = 0;
if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
- status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry),
+ status = nfsd4_deleg_getattr_conflict(rqstp, dentry,
&file_modified, &size);
if (status)
goto out;
@@ -3617,7 +3620,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
args.contextsupport = false;
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- args.context = NULL;
if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 9e0ea6fc2aa3..34eb2c2cbcde 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2069,8 +2069,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
continue;
}
- ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa,
- SVC_SOCK_ANONYMOUS,
+ ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0,
get_current_cred());
/* always save the latest error */
if (ret < 0)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index ffc217099d19..ec4559ecd193 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -781,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
}
extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
- struct inode *inode, bool *file_modified, u64 *size);
+ struct dentry *dentry, bool *file_modified, u64 *size);
#endif /* NFSD4_STATE_H */
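The nfs4state.c hunk above pins the delegation before ctx->flc_lock is dropped and drops that pin on every exit path. A condensed sketch of that lifetime rule, with a hypothetical helper standing in for the blocking work (this is illustrative, not code from the patch):

    static __be32 deleg_getattr_pin_sketch(struct file_lock_context *ctx,
                                           struct nfs4_delegation *dp)
    {
            __be32 status;

            /* Pin the stateid while ctx->flc_lock is still held ... */
            refcount_inc(&dp->dl_stid.sc_count);
            spin_unlock(&ctx->flc_lock);

            /*
             * ... so the CB_GETATTR, the conditional recall and the
             * notify_change() call cannot race with the delegation being
             * freed.  do_blocking_getattr() is a hypothetical stand-in
             * for that work.
             */
            status = do_blocking_getattr(dp);

            /* Paired put on every return path, as in the hunk above. */
            nfs4_put_stid(&dp->dl_stid);
            return status;
    }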
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 4a29b0138d75..4b3e19d74925 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -83,7 +83,7 @@ static int nilfs_prepare_chunk(struct folio *folio, unsigned int from,
{
loff_t pos = folio_pos(folio) + from;
- return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block);
+ return __block_write_begin(folio, pos, to - from, nilfs_get_block);
}
static void nilfs_commit_chunk(struct folio *folio,
@@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio,
int err;
nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to);
- copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL);
+ copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL);
if (pos + copied > dir->i_size)
i_size_write(dir, pos + copied);
if (IS_DIRSYNC(dir))
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7340a01d80e1..8661f452dba6 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -250,7 +250,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
static int nilfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -259,7 +259,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(err))
return err;
- err = block_write_begin(mapping, pos, len, pagep, nilfs_get_block);
+ err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block);
if (unlikely(err)) {
nilfs_write_failed(mapping, pos + len);
nilfs_transaction_abort(inode->i_sb);
@@ -269,16 +269,16 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
static int nilfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
unsigned int start = pos & (PAGE_SIZE - 1);
unsigned int nr_dirty;
int err;
- nr_dirty = nilfs_page_count_clean_buffers(page, start,
+ nr_dirty = nilfs_page_count_clean_buffers(&folio->page, start,
start + copied);
- copied = generic_write_end(file, mapping, pos, len, copied, page,
+ copied = generic_write_end(file, mapping, pos, len, copied, folio,
fsdata);
nilfs_set_file_dirty(inode, nr_dirty);
err = nilfs_transaction_commit(inode->i_sb);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index b638dc06df2f..ec61ce9f29a2 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -498,7 +498,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
struct inode *inode;
struct nilfs_recovery_block *rb, *n;
unsigned int blocksize = nilfs->ns_blocksize;
- struct page *page;
+ struct folio *folio;
loff_t pos;
int err = 0, err2 = 0;
@@ -512,7 +512,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
pos = rb->blkoff << inode->i_blkbits;
err = block_write_begin(inode->i_mapping, pos, blocksize,
- &page, nilfs_get_block);
+ &folio, nilfs_get_block);
if (unlikely(err)) {
loff_t isize = inode->i_size;
@@ -522,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
goto failed_inode;
}
- err = nilfs_recovery_copy_block(nilfs, rb, pos, page);
+ err = nilfs_recovery_copy_block(nilfs, rb, pos, &folio->page);
if (unlikely(err))
goto failed_page;
@@ -531,17 +531,17 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
goto failed_page;
block_write_end(NULL, inode->i_mapping, pos, blocksize,
- blocksize, page, NULL);
+ blocksize, folio, NULL);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
(*nr_salvaged_blocks)++;
goto next;
failed_page:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
failed_inode:
nilfs_warn(sb,
@@ -716,6 +716,33 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
}
/**
+ * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery
+ * @nilfs: nilfs object
+ */
+static void nilfs_abort_roll_forward(struct the_nilfs *nilfs)
+{
+ struct nilfs_inode_info *ii, *n;
+ LIST_HEAD(head);
+
+ /* Abandon inodes that have read recovery data */
+ spin_lock(&nilfs->ns_inode_lock);
+ list_splice_init(&nilfs->ns_dirty_files, &head);
+ spin_unlock(&nilfs->ns_inode_lock);
+ if (list_empty(&head))
+ return;
+
+ set_nilfs_purging(nilfs);
+ list_for_each_entry_safe(ii, n, &head, i_dirty) {
+ spin_lock(&nilfs->ns_inode_lock);
+ list_del_init(&ii->i_dirty);
+ spin_unlock(&nilfs->ns_inode_lock);
+
+ iput(&ii->vfs_inode);
+ }
+ clear_nilfs_purging(nilfs);
+}
+
+/**
* nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
* @nilfs: nilfs object
* @sb: super block instance
@@ -773,15 +800,19 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
if (unlikely(err)) {
nilfs_err(sb, "error %d writing segment for recovery",
err);
- goto failed;
+ goto put_root;
}
nilfs_finish_roll_forward(nilfs, ri);
}
- failed:
+put_root:
nilfs_put_root(root);
return err;
+
+failed:
+ nilfs_abort_roll_forward(nilfs);
+ goto put_root;
}
/**
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 0ca3110d6386..871ec35ea8e8 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1812,6 +1812,9 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
nilfs_abort_logs(&logs, ret ? : err);
list_splice_tail_init(&sci->sc_segbufs, &logs);
+ if (list_empty(&logs))
+ return; /* if the first segment buffer preparation failed */
+
nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
nilfs_free_incomplete_logs(&logs, nilfs);
@@ -2056,7 +2059,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
err = nilfs_segctor_begin_construction(sci, nilfs);
if (unlikely(err))
- goto out;
+ goto failed;
/* Update time stamp */
sci->sc_seg_ctime = ktime_get_real_seconds();
@@ -2120,10 +2123,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
return err;
failed_to_write:
- if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
- nilfs_redirty_inodes(&sci->sc_dirty_files);
-
failed:
+ if (mode == SC_LSEG_SR && nilfs_sc_cstage_get(sci) >= NILFS_ST_IFILE)
+ nilfs_redirty_inodes(&sci->sc_dirty_files);
if (nilfs_doing_gc())
nilfs_redirty_inodes(&sci->sc_gc_inodes);
nilfs_segctor_abort_construction(sci, nilfs, err);
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index a5569b7f47a3..14868a3dd592 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -836,9 +836,15 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- struct nilfs_super_block **sbp = nilfs->ns_sbp;
- u32 major = le32_to_cpu(sbp[0]->s_rev_level);
- u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
+ struct nilfs_super_block *raw_sb;
+ u32 major;
+ u16 minor;
+
+ down_read(&nilfs->ns_sem);
+ raw_sb = nilfs->ns_sbp[0];
+ major = le32_to_cpu(raw_sb->s_rev_level);
+ minor = le16_to_cpu(raw_sb->s_minor_rev_level);
+ up_read(&nilfs->ns_sem);
return sysfs_emit(buf, "%d.%d\n", major, minor);
}
@@ -856,8 +862,13 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- struct nilfs_super_block **sbp = nilfs->ns_sbp;
- u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
+ struct nilfs_super_block *raw_sb;
+ u64 dev_size;
+
+ down_read(&nilfs->ns_sem);
+ raw_sb = nilfs->ns_sbp[0];
+ dev_size = le64_to_cpu(raw_sb->s_dev_size);
+ up_read(&nilfs->ns_sem);
return sysfs_emit(buf, "%llu\n", dev_size);
}
@@ -879,9 +890,15 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- struct nilfs_super_block **sbp = nilfs->ns_sbp;
+ struct nilfs_super_block *raw_sb;
+ ssize_t len;
- return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid);
+ down_read(&nilfs->ns_sem);
+ raw_sb = nilfs->ns_sbp[0];
+ len = sysfs_emit(buf, "%pUb\n", raw_sb->s_uuid);
+ up_read(&nilfs->ns_sem);
+
+ return len;
}
static
@@ -889,10 +906,16 @@ ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr,
struct the_nilfs *nilfs,
char *buf)
{
- struct nilfs_super_block **sbp = nilfs->ns_sbp;
+ struct nilfs_super_block *raw_sb;
+ ssize_t len;
+
+ down_read(&nilfs->ns_sem);
+ raw_sb = nilfs->ns_sbp[0];
+ len = scnprintf(buf, sizeof(raw_sb->s_volume_name), "%s\n",
+ raw_sb->s_volume_name);
+ up_read(&nilfs->ns_sem);
- return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n",
- sbp[0]->s_volume_name);
+ return len;
}
static const char dev_readme_str[] =
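All four converted show handlers follow the same shape; a minimal sketch of that pattern, reusing only fields visible above (not code from the patch): snapshot the raw super block fields while holding ns_sem for read, then format the sysfs output after dropping the lock, so a concurrent super block update is never observed half-written.

    static ssize_t nilfs_dev_example_show_sketch(struct the_nilfs *nilfs,
                                                 char *buf)
    {
            u64 dev_size;
            u32 rev;

            down_read(&nilfs->ns_sem);      /* pin ns_sbp[0] contents */
            dev_size = le64_to_cpu(nilfs->ns_sbp[0]->s_dev_size);
            rev = le32_to_cpu(nilfs->ns_sbp[0]->s_rev_level);
            up_read(&nilfs->ns_sem);

            return sysfs_emit(buf, "%u %llu\n", rev, dev_size);
    }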
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index f3669403fabf..46440fbb8662 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -110,7 +110,7 @@ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
prev = &dn->dn_next;
continue;
}
- fown = &dn->dn_filp->f_owner;
+ fown = file_f_owner(dn->dn_filp);
send_sigio(fown, dn->dn_fd, POLL_MSG);
if (dn->dn_mask & FS_DN_MULTISHOT)
prev = &dn->dn_next;
@@ -316,6 +316,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
+ error = file_f_owner_allocate(filp);
+ if (error)
+ goto out_err;
+
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
fsnotify_init_mark(new_fsn_mark, dnotify_group);
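fcntl_dirnotify() is the kernel side of F_NOTIFY; a minimal userspace sketch of the request that now triggers the lazy f_owner allocation (standard dnotify usage, not taken from this series):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void on_sigio(int sig)
    {
            /* directory changed; real code would re-scan it here */
    }

    int main(void)
    {
            int fd = open(".", O_RDONLY | O_DIRECTORY);

            if (fd < 0)
                    return 1;
            signal(SIGIO, on_sigio);
            /*
             * SIGIO delivery uses the file's f_owner, which the kernel now
             * allocates in fcntl_dirnotify() via file_f_owner_allocate().
             */
            if (fcntl(fd, F_NOTIFY, DN_CREATE | DN_MODIFY | DN_MULTISHOT) < 0) {
                    perror("F_NOTIFY");
                    return 1;
            }
            pause();
            return 0;
    }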
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 97c37a9631e5..67ee176b8824 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -12,6 +12,7 @@
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
+#include <linux/mnt_namespace.h>
#include "mount.h"
#include "internal.h"
@@ -128,6 +129,30 @@ int open_related_ns(struct ns_common *ns,
}
EXPORT_SYMBOL_GPL(open_related_ns);
+static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
+ struct mnt_ns_info __user *uinfo, size_t usize,
+ struct mnt_ns_info *kinfo)
+{
+ /*
+ * If userspace and the kernel have the same struct size it can just
+ * be copied. If userspace provides an older struct, only the bits that
+ * userspace knows about will be copied. If userspace provides a new
+ * struct, only the bits that the kernel knows about will be copied and
+ * the size value will be set to the size the kernel knows about.
+ */
+ kinfo->size = min(usize, sizeof(*kinfo));
+ kinfo->mnt_ns_id = mnt_ns->seq;
+ kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
+ /* Subtract the root mount of the mount namespace. */
+ if (kinfo->nr_mounts)
+ kinfo->nr_mounts--;
+
+ if (copy_to_user(uinfo, kinfo, kinfo->size))
+ return -EFAULT;
+
+ return 0;
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
@@ -135,6 +160,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct pid_namespace *pid_ns;
struct task_struct *tsk;
struct ns_common *ns = get_proc_ns(file_inode(filp));
+ struct mnt_namespace *mnt_ns;
+ bool previous = false;
uid_t __user *argp;
uid_t uid;
int ret;
@@ -156,7 +183,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
case NS_GET_MNTNS_ID: {
- struct mnt_namespace *mnt_ns;
__u64 __user *idp;
__u64 id;
@@ -211,7 +237,79 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
if (!ret)
ret = -ESRCH;
- break;
+ return ret;
+ }
+ }
+
+ /* extensible ioctls */
+ switch (_IOC_NR(ioctl)) {
+ case _IOC_NR(NS_MNT_GET_INFO): {
+ struct mnt_ns_info kinfo = {};
+ struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
+ size_t usize = _IOC_SIZE(ioctl);
+
+ if (ns->ops->type != CLONE_NEWNS)
+ return -EINVAL;
+
+ if (!uinfo)
+ return -EINVAL;
+
+ if (usize < MNT_NS_INFO_SIZE_VER0)
+ return -EINVAL;
+
+ return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
+ }
+ case _IOC_NR(NS_MNT_GET_PREV):
+ previous = true;
+ fallthrough;
+ case _IOC_NR(NS_MNT_GET_NEXT): {
+ struct mnt_ns_info kinfo = {};
+ struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
+ struct path path __free(path_put) = {};
+ struct file *f __free(fput) = NULL;
+ size_t usize = _IOC_SIZE(ioctl);
+
+ if (ns->ops->type != CLONE_NEWNS)
+ return -EINVAL;
+
+ if (usize < MNT_NS_INFO_SIZE_VER0)
+ return -EINVAL;
+
+ if (previous)
+ mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
+ else
+ mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
+ if (IS_ERR(mnt_ns))
+ return PTR_ERR(mnt_ns);
+
+ ns = to_ns_common(mnt_ns);
+ /* Transfer ownership of @mnt_ns reference to @path. */
+ ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (ret)
+ return ret;
+
+ CLASS(get_unused_fd, fd)(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ f = dentry_open(&path, O_RDONLY, current_cred());
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+
+ if (uinfo) {
+ /*
+ * If @uinfo is passed return all information about the
+ * mount namespace as well.
+ */
+ ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
+ if (ret)
+ return ret;
+ }
+
+ /* Transfer reference of @f to caller's fdtable. */
+ fd_install(fd, no_free_ptr(f));
+ /* File descriptor is live so hand it off to the caller. */
+ return take_fd(fd);
}
default:
ret = -ENOTTY;
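A minimal userspace sketch of the new extensible ioctl, assuming NS_MNT_GET_INFO and struct mnt_ns_info are exported through <linux/nsfs.h> by this series; the size handshake is carried in _IOC_SIZE of the ioctl number, so older and newer struct layouts both work:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/nsfs.h>

    int main(void)
    {
            struct mnt_ns_info info = {0};
            int fd = open("/proc/self/ns/mnt", O_RDONLY);

            if (fd < 0)
                    return 1;
            if (ioctl(fd, NS_MNT_GET_INFO, &info) < 0) {
                    perror("NS_MNT_GET_INFO");
                    return 1;
            }
            printf("mnt_ns_id=%llu nr_mounts=%u\n",
                   (unsigned long long)info.mnt_ns_id, info.nr_mounts);
            return 0;
    }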
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index ca1ddc46bd86..6202895a4542 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -182,7 +182,7 @@ static int ntfs_extend_initialized_size(struct file *file,
for (;;) {
u32 zerofrom, len;
- struct page *page;
+ struct folio *folio;
u8 bits;
CLST vcn, lcn, clen;
@@ -208,14 +208,13 @@ static int ntfs_extend_initialized_size(struct file *file,
if (pos + len > new_valid)
len = new_valid - pos;
- err = ntfs_write_begin(file, mapping, pos, len, &page, NULL);
+ err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL);
if (err)
goto out;
- zero_user_segment(page, zerofrom, PAGE_SIZE);
+ folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom);
- /* This function in any case puts page. */
- err = ntfs_write_end(file, mapping, pos, len, len, page, NULL);
+ err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL);
if (err < 0)
goto out;
pos += len;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 6b0bdc474e76..f672072e6bd4 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -901,7 +901,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
}
int ntfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, struct page **pagep, void **fsdata)
+ loff_t pos, u32 len, struct folio **foliop, void **fsdata)
{
int err;
struct inode *inode = mapping->host;
@@ -910,7 +910,6 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
- *pagep = NULL;
if (is_resident(ni)) {
struct folio *folio = __filemap_get_folio(
mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
@@ -926,7 +925,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
ni_unlock(ni);
if (!err) {
- *pagep = &folio->page;
+ *foliop = folio;
goto out;
}
folio_unlock(folio);
@@ -936,7 +935,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
goto out;
}
- err = block_write_begin(mapping, pos, len, pagep,
+ err = block_write_begin(mapping, pos, len, foliop,
ntfs_get_block_write_begin);
out:
@@ -947,9 +946,8 @@ out:
* ntfs_write_end - Address_space_operations::write_end.
*/
int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
- u32 len, u32 copied, struct page *page, void *fsdata)
+ u32 len, u32 copied, struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
u64 valid = ni->i_valid;
@@ -979,7 +977,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
folio_unlock(folio);
folio_put(folio);
} else {
- err = generic_write_end(file, mapping, pos, len, copied, page,
+ err = generic_write_end(file, mapping, pos, len, copied, folio,
fsdata);
}
@@ -1008,45 +1006,6 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
return err;
}
-int reset_log_file(struct inode *inode)
-{
- int err;
- loff_t pos = 0;
- u32 log_size = inode->i_size;
- struct address_space *mapping = inode->i_mapping;
-
- for (;;) {
- u32 len;
- void *kaddr;
- struct page *page;
-
- len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE;
-
- err = block_write_begin(mapping, pos, len, &page,
- ntfs_get_block_write_begin);
- if (err)
- goto out;
-
- kaddr = kmap_atomic(page);
- memset(kaddr, -1, len);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
-
- err = block_write_end(NULL, mapping, pos, len, len, page, NULL);
- if (err < 0)
- goto out;
- pos += len;
-
- if (pos >= log_size)
- break;
- balance_dirty_pages_ratelimited(mapping);
- }
-out:
- mark_inode_dirty_sync(inode);
-
- return err;
-}
-
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index e5255a251929..584f814715f4 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -708,13 +708,12 @@ int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi,
struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
const struct cpu_str *name);
int ntfs_set_size(struct inode *inode, u64 new_size);
-int reset_log_file(struct inode *inode);
int ntfs_get_block(struct inode *inode, sector_t vbn,
struct buffer_head *bh_result, int create);
int ntfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, struct page **pagep, void **fsdata);
+ loff_t pos, u32 len, struct folio **foliop, void **fsdata);
int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
- u32 len, u32 copied, struct page *page, void *fsdata);
+ u32 len, u32 copied, struct folio *folio, void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 6be175a1ab3c..d6c985cc6353 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1643,7 +1643,7 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, ocfs2_write_type_t type,
- struct page **pagep, void **fsdata,
+ struct folio **foliop, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
@@ -1826,8 +1826,8 @@ try_again:
ocfs2_free_alloc_context(meta_ac);
success:
- if (pagep)
- *pagep = wc->w_target_page;
+ if (foliop)
+ *foliop = page_folio(wc->w_target_page);
*fsdata = wc;
return 0;
out_quota:
@@ -1879,7 +1879,7 @@ out:
static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
struct buffer_head *di_bh = NULL;
@@ -1901,7 +1901,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
- pagep, fsdata, di_bh, NULL);
+ foliop, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
@@ -2076,7 +2076,7 @@ out:
static int ocfs2_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
int ret;
struct inode *inode = mapping->host;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 3a520117fa59..45db1781ea73 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -38,7 +38,7 @@ typedef enum {
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, ocfs2_write_type_t type,
- struct page **pagep, void **fsdata,
+ struct folio **foliop, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
int ocfs2_read_inline_data(struct inode *inode, struct page *page,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f0beb173dbba..ccef3f42b333 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1932,6 +1932,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
{
int error = 0;
struct inode *inode = file_inode(file);
+ struct ocfs2_file_private *fp = file->private_data;
int lock_level = 0;
trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1952,7 +1953,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
goto bail_nolock;
}
- error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
+ error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false);
ocfs2_inode_unlock(inode, lock_level);
if (error)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ccc57038a977..ad131a2fc58e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -755,7 +755,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
u64 abs_to, struct buffer_head *di_bh)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
unsigned long index = abs_from >> PAGE_SHIFT;
handle_t *handle;
int ret = 0;
@@ -774,9 +774,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
goto out;
}
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (!page) {
- ret = -ENOMEM;
+ folio = __filemap_get_folio(mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
mlog_errno(ret);
goto out_commit_trans;
}
@@ -803,7 +804,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
* __block_write_begin and block_commit_write to zero the
* whole block.
*/
- ret = __block_write_begin(page, block_start + 1, 0,
+ ret = __block_write_begin(folio, block_start + 1, 0,
ocfs2_get_block);
if (ret < 0) {
mlog_errno(ret);
@@ -812,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/* must not update i_size! */
- block_commit_write(page, block_start + 1, block_start + 1);
+ block_commit_write(&folio->page, block_start + 1, block_start + 1);
}
/*
@@ -833,8 +834,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
}
out_unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out_commit_trans:
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -2750,6 +2751,13 @@ out_unlock:
return remapped > 0 ? remapped : ret;
}
+static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct ocfs2_file_private *fp = file->private_data;
+
+ return generic_llseek_cookie(file, offset, whence, &fp->cookie);
+}
+
const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
@@ -2797,7 +2805,7 @@ const struct file_operations ocfs2_fops = {
WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
const struct file_operations ocfs2_dops = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_dir_llseek,
.read = generic_read_dir,
.iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,
@@ -2843,7 +2851,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
};
const struct file_operations ocfs2_dops_no_plocks = {
- .llseek = generic_file_llseek,
+ .llseek = ocfs2_dir_llseek,
.read = generic_read_dir,
.iterate_shared = shared_ocfs2_readdir,
.fsync = ocfs2_sync_file,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 8e53e4ac1120..41e65e45a9f3 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -20,6 +20,7 @@ struct ocfs2_alloc_context;
enum ocfs2_alloc_restarted;
struct ocfs2_file_private {
+ u64 cookie;
struct file *fp_file;
struct mutex fp_mutex;
struct ocfs2_lock_res fp_flock;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 1834f26522ed..6ef4cb045ccd 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -53,7 +53,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
loff_t pos = page_offset(page);
unsigned int len = PAGE_SIZE;
pgoff_t last_index;
- struct page *locked_page = NULL;
+ struct folio *locked_folio = NULL;
void *fsdata;
loff_t size = i_size_read(inode);
@@ -91,7 +91,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
len = ((size - 1) & ~PAGE_MASK) + 1;
err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
- &locked_page, &fsdata, di_bh, page);
+ &locked_folio, &fsdata, di_bh, page);
if (err) {
if (err != -ENOSPC)
mlog_errno(err);
@@ -99,7 +99,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
goto out;
}
- if (!locked_page) {
+ if (!locked_folio) {
ret = VM_FAULT_NOPAGE;
goto out;
}
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 6b580b9da8e3..98358d405b6a 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -312,11 +312,11 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
static int omfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, pagep, omfs_get_block);
+ ret = block_write_begin(mapping, pos, len, foliop, omfs_get_block);
if (unlikely(ret))
omfs_write_failed(mapping, pos + len);
diff --git a/fs/open.c b/fs/open.c
index 22adbef7ecc2..daf1b55ca818 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -252,40 +252,39 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (offset < 0 || len <= 0)
return -EINVAL;
- /* Return error if mode is not supported */
- if (mode & ~FALLOC_FL_SUPPORTED_MASK)
+ if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
- /* Punch hole and zero range are mutually exclusive */
- if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
- (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
- return -EOPNOTSUPP;
-
- /* Punch hole must have keep size set */
- if ((mode & FALLOC_FL_PUNCH_HOLE) &&
- !(mode & FALLOC_FL_KEEP_SIZE))
+ /*
+ * Modes are exclusive, even if that is not obvious from the encoding
+ * as bit masks and the mix with the flag in the same namespace.
+ *
+ * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
+ * encoded as no bit set.
+ */
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_ALLOCATE_RANGE:
+ case FALLOC_FL_UNSHARE_RANGE:
+ case FALLOC_FL_ZERO_RANGE:
+ break;
+ case FALLOC_FL_PUNCH_HOLE:
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ case FALLOC_FL_INSERT_RANGE:
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+ break;
+ default:
return -EOPNOTSUPP;
-
- /* Collapse range should only be used exclusively. */
- if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
- (mode & ~FALLOC_FL_COLLAPSE_RANGE))
- return -EINVAL;
-
- /* Insert range should only be used exclusively. */
- if ((mode & FALLOC_FL_INSERT_RANGE) &&
- (mode & ~FALLOC_FL_INSERT_RANGE))
- return -EINVAL;
-
- /* Unshare range should only be used with allocate mode. */
- if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
- (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
- return -EINVAL;
+ }
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/*
- * We can only allow pure fallocate on append only files
+ * On append-only files only space preallocation is supported.
*/
if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
return -EPERM;
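The switch above turns the old ad-hoc checks into one rule per mode; the contract visible to userspace is unchanged. For example, punching a hole is still only accepted together with FALLOC_FL_KEEP_SIZE (illustrative sketch on any filesystem that supports hole punching):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <linux/falloc.h>

    int main(void)
    {
            int fd = open("testfile", O_RDWR | O_CREAT, 0644);

            if (fd < 0 || ftruncate(fd, 65536) < 0)
                    return 1;
            /* rejected with EOPNOTSUPP by the switch in vfs_fallocate() */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE, 0, 4096) < 0)
                    perror("punch without KEEP_SIZE");
            /* accepted, provided the filesystem implements hole punching */
            if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          0, 4096) < 0)
                    perror("punch with KEEP_SIZE");
            return 0;
    }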
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index fdb9b65db1de..aae6d2b8767d 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -309,22 +309,18 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
static int orangefs_write_begin(struct file *file,
struct address_space *mapping, loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct orangefs_write_range *wr;
struct folio *folio;
- struct page *page;
- pgoff_t index;
int ret;
- index = pos >> PAGE_SHIFT;
+ folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
-
- *pagep = page;
- folio = page_folio(page);
+ *foliop = folio;
if (folio_test_dirty(folio) && !folio_test_private(folio)) {
/*
@@ -365,9 +361,10 @@ okay:
}
static int orangefs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
+ loff_t pos, unsigned len, unsigned copied, struct folio *folio,
+ void *fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
/*
@@ -377,23 +374,23 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping,
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
- /* zero the stale part of the page if we did a short copy */
- if (!PageUptodate(page)) {
+ /* zero the stale part of the folio if we did a short copy */
+ if (!folio_test_uptodate(folio)) {
unsigned from = pos & (PAGE_SIZE - 1);
if (copied < len) {
- zero_user(page, from + copied, len - copied);
+ folio_zero_range(folio, from + copied, len - copied);
}
/* Set fully written pages uptodate. */
- if (pos == page_offset(page) &&
+ if (pos == folio_pos(folio) &&
(len == PAGE_SIZE || pos + len == inode->i_size)) {
- zero_user_segment(page, from + copied, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segment(folio, from + copied, PAGE_SIZE);
+ folio_mark_uptodate(folio);
}
}
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
mark_inode_dirty_sync(file_inode(file));
return copied;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index a5ef2005a2cc..2ed6ad641a20 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -115,12 +115,12 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de
continue;
error = security_inode_copy_up_xattr(old, name);
- if (error < 0 && error != -EOPNOTSUPP)
- break;
- if (error == 1) {
+ if (error == -ECANCELED) {
error = 0;
continue; /* Discard */
}
+ if (error < 0 && error != -EOPNOTSUPP)
+ break;
if (is_posix_acl_xattr(name)) {
error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name);
@@ -243,8 +243,24 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen)
return 0;
}
+static int ovl_sync_file(struct path *path)
+{
+ struct file *new_file;
+ int err;
+
+ new_file = ovl_path_open(path, O_LARGEFILE | O_RDONLY);
+ if (IS_ERR(new_file))
+ return PTR_ERR(new_file);
+
+ err = vfs_fsync(new_file, 0);
+ fput(new_file);
+
+ return err;
+}
+
static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
- struct file *new_file, loff_t len)
+ struct file *new_file, loff_t len,
+ bool datasync)
{
struct path datapath;
struct file *old_file;
@@ -342,7 +358,8 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
len -= bytes;
}
- if (!error && ovl_should_sync(ofs))
+ /* call fsync once, either now or later along with metadata */
+ if (!error && ovl_should_sync(ofs) && datasync)
error = vfs_fsync(new_file, 0);
out_fput:
fput(old_file);
@@ -574,6 +591,7 @@ struct ovl_copy_up_ctx {
bool indexed;
bool metacopy;
bool metacopy_digest;
+ bool metadata_fsync;
};
static int ovl_link_up(struct ovl_copy_up_ctx *c)
@@ -634,7 +652,8 @@ static int ovl_copy_up_data(struct ovl_copy_up_ctx *c, const struct path *temp)
if (IS_ERR(new_file))
return PTR_ERR(new_file);
- err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size);
+ err = ovl_copy_up_file(ofs, c->dentry, new_file, c->stat.size,
+ !c->metadata_fsync);
fput(new_file);
return err;
@@ -701,6 +720,10 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
err = ovl_set_attr(ofs, temp, &c->stat);
inode_unlock(temp->d_inode);
+ /* fsync metadata before moving it into upper dir */
+ if (!err && ovl_should_sync(ofs) && c->metadata_fsync)
+ err = ovl_sync_file(&upperpath);
+
return err;
}
@@ -860,7 +883,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
temp = tmpfile->f_path.dentry;
if (!c->metacopy && c->stat.size) {
- err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size);
+ err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size,
+ !c->metadata_fsync);
if (err)
goto out_fput;
}
@@ -1135,6 +1159,17 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
!kgid_has_mapping(current_user_ns(), ctx.stat.gid))
return -EOVERFLOW;
+ /*
+ * With metacopy disabled, we fsync after final metadata copyup, for
+ * both regular files and directories to get atomic copyup semantics
+ * on filesystems that do not use strict metadata ordering (e.g. ubifs).
+ *
+ * With metacopy enabled we want to avoid fsync on all meta copyup
+ * that will hurt performance of workloads such as chown -R, so we
+ * only fsync on data copyup as legacy behavior.
+ */
+ ctx.metadata_fsync = !OVL_FS(dentry->d_sb)->config.metacopy &&
+ (S_ISREG(ctx.stat.mode) || S_ISDIR(ctx.stat.mode));
ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags);
if (parent) {
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 4860fcc4611b..e42546c6c5df 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -353,6 +353,8 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
case Opt_datadir_add:
ctx->nr_data++;
fallthrough;
+ case Opt_lowerdir:
+ fallthrough;
case Opt_lowerdir_add:
WARN_ON(ctx->nr >= ctx->capacity);
l = &ctx->lower[ctx->nr++];
@@ -365,10 +367,9 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
}
}
-static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
- enum ovl_opt layer)
+static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer)
{
- char *name = kstrdup(param->string, GFP_KERNEL);
+ char *name = kstrdup(layer_name, GFP_KERNEL);
bool upper = (layer == Opt_upperdir || layer == Opt_workdir);
struct path path;
int err;
@@ -376,7 +377,7 @@ static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
if (!name)
return -ENOMEM;
- if (upper)
+ if (upper || layer == Opt_lowerdir)
err = ovl_mount_dir(name, &path);
else
err = ovl_mount_dir_noesc(name, &path);
@@ -432,7 +433,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
{
int err;
struct ovl_fs_context *ctx = fc->fs_private;
- struct ovl_fs_context_layer *l;
char *dup = NULL, *iter;
ssize_t nr_lower, nr;
bool data_layer = false;
@@ -449,7 +449,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
return 0;
if (*name == ':') {
- pr_err("cannot append lower layer");
+ pr_err("cannot append lower layer\n");
return -EINVAL;
}
@@ -472,35 +472,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
goto out_err;
}
- if (nr_lower > ctx->capacity) {
- err = -ENOMEM;
- l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower),
- GFP_KERNEL_ACCOUNT);
- if (!l)
- goto out_err;
-
- ctx->lower = l;
- ctx->capacity = nr_lower;
- }
-
iter = dup;
- l = ctx->lower;
- for (nr = 0; nr < nr_lower; nr++, l++) {
- ctx->nr++;
- memset(l, 0, sizeof(*l));
-
- err = ovl_mount_dir(iter, &l->path);
- if (err)
- goto out_put;
-
- err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false);
+ for (nr = 0; nr < nr_lower; nr++) {
+ err = ovl_parse_layer(fc, iter, Opt_lowerdir);
if (err)
- goto out_put;
-
- err = -ENOMEM;
- l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT);
- if (!l->name)
- goto out_put;
+ goto out_err;
if (data_layer)
ctx->nr_data++;
@@ -517,8 +493,8 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
* there are no data layers.
*/
if (ctx->nr_data > 0) {
- pr_err("regular lower layers cannot follow data lower layers");
- goto out_put;
+ pr_err("regular lower layers cannot follow data lower layers\n");
+ goto out_err;
}
data_layer = false;
@@ -532,9 +508,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
kfree(dup);
return 0;
-out_put:
- ovl_reset_lowerdirs(ctx);
-
out_err:
kfree(dup);
@@ -582,7 +555,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_datadir_add:
case Opt_upperdir:
case Opt_workdir:
- err = ovl_parse_layer(fc, param, opt);
+ err = ovl_parse_layer(fc, param->string, opt);
break;
case Opt_default_permissions:
config->default_permissions = true;
@@ -782,11 +755,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
{
struct ovl_opt_set set = ctx->set;
- if (ctx->nr_data > 0 && !config->metacopy) {
- pr_err("lower data-only dirs require metacopy support.\n");
- return -EINVAL;
- }
-
/* Workdir/index are useless in non-upper mount */
if (!config->upperdir) {
if (config->workdir) {
@@ -938,6 +906,39 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
config->metacopy = false;
}
+ /*
+ * Fail if we don't have trusted xattr capability and a feature was
+ * explicitly requested that requires them.
+ */
+ if (!config->userxattr && !capable(CAP_SYS_ADMIN)) {
+ if (set.redirect &&
+ config->redirect_mode != OVL_REDIRECT_NOFOLLOW) {
+ pr_err("redirect_dir requires permission to access trusted xattrs\n");
+ return -EPERM;
+ }
+ if (config->metacopy && set.metacopy) {
+ pr_err("metacopy requires permission to access trusted xattrs\n");
+ return -EPERM;
+ }
+ if (config->verity_mode) {
+ pr_err("verity requires permission to access trusted xattrs\n");
+ return -EPERM;
+ }
+ if (ctx->nr_data > 0) {
+ pr_err("lower data-only dirs require permission to access trusted xattrs\n");
+ return -EPERM;
+ }
+ /*
+ * Other xattr-dependent features should be disabled without
+ * great disturbance to the user in ovl_make_workdir().
+ */
+ }
+
+ if (ctx->nr_data > 0 && !config->metacopy) {
+ pr_err("lower data-only dirs require metacopy support.\n");
+ return -EINVAL;
+ }
+
return 0;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 06a231970cb5..fe511192f83c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -202,15 +202,9 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
int ret;
ret = ovl_sync_status(ofs);
- /*
- * We have to always set the err, because the return value isn't
- * checked in syncfs, and instead indirectly return an error via
- * the sb's writeback errseq, which VFS inspects after this call.
- */
- if (ret < 0) {
- errseq_set(&sb->s_wb_err, -EIO);
+
+ if (ret < 0)
return -EIO;
- }
if (!ret)
return ret;
diff --git a/fs/pipe.c b/fs/pipe.c
index 7dff2aa50a6d..4083ba492cb6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -686,7 +686,7 @@ pipe_poll(struct file *filp, poll_table *wait)
if (filp->f_mode & FMODE_READ) {
if (!pipe_empty(head, tail))
mask |= EPOLLIN | EPOLLRDNORM;
- if (!pipe->writers && filp->f_version != pipe->w_counter)
+ if (!pipe->writers && filp->f_pipe != pipe->w_counter)
mask |= EPOLLHUP;
}
@@ -945,6 +945,7 @@ int create_pipe_files(struct file **res, int flags)
}
f->private_data = inode->i_pipe;
+ f->f_pipe = 0;
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
&pipefifo_fops);
@@ -954,6 +955,7 @@ int create_pipe_files(struct file **res, int flags)
return PTR_ERR(res[0]);
}
res[0]->private_data = inode->i_pipe;
+ res[0]->f_pipe = 0;
res[1] = f;
stream_open(inode, res[0]);
stream_open(inode, res[1]);
@@ -1108,7 +1110,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
int ret;
- filp->f_version = 0;
+ filp->f_pipe = 0;
spin_lock(&inode->i_lock);
if (inode->i_pipe) {
@@ -1155,7 +1157,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
if ((filp->f_flags & O_NONBLOCK)) {
/* suppress EPOLLHUP until we have
* seen a writer */
- filp->f_version = pipe->w_counter;
+ filp->f_pipe = pipe->w_counter;
} else {
if (wait_for_partner(pipe, &pipe->w_counter))
goto err_rd;
@@ -1427,7 +1429,7 @@ static const struct super_operations pipefs_ops = {
/*
* pipefs should _never_ be mounted by userland - too much of security hassle,
- * no real gain from having the whole whorehouse mounted. So we don't need
+ * no real gain from having the whole file system mounted. So we don't need
* any operations on the root directory. However, we need a non-trivial
* d_name - pipe: will go nicely and kill the special-casing in procfs.
*/
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 3f87297dbfdb..6c66a37522d0 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -715,8 +715,8 @@ int posix_acl_update_mode(struct mnt_idmap *idmap,
return error;
if (error == 0)
*acl = NULL;
- if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) &&
- !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
+ if (!in_group_or_capable(idmap, inode,
+ i_gid_into_vfsgid(idmap, inode)))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 72a1acd03675..e7810f3bd522 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -85,6 +85,7 @@
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
@@ -117,6 +118,40 @@
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;
+enum proc_mem_force {
+ PROC_MEM_FORCE_ALWAYS,
+ PROC_MEM_FORCE_PTRACE,
+ PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+ IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+ IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+ PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+ { "always", PROC_MEM_FORCE_ALWAYS },
+ { "ptrace", PROC_MEM_FORCE_PTRACE },
+ { "never", PROC_MEM_FORCE_NEVER },
+ { }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ /*
+ * lookup_constant() defaults to proc_mem_force_override to preserve
+ * the initial Kconfig choice in case an invalid param gets passed.
+ */
+ proc_mem_force_override = lookup_constant(proc_mem_force_table,
+ buf, proc_mem_force_override);
+
+ return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
+
struct pid_entry {
const char *name;
unsigned int len;
@@ -827,12 +862,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
static int mem_open(struct inode *inode, struct file *file)
{
- int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+ return -EINVAL;
+ return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
- return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+ struct task_struct *task;
+ bool ptrace_active = false;
+
+ switch (proc_mem_force_override) {
+ case PROC_MEM_FORCE_NEVER:
+ return false;
+ case PROC_MEM_FORCE_PTRACE:
+ task = get_proc_task(file_inode(file));
+ if (task) {
+ ptrace_active = READ_ONCE(task->ptrace) &&
+ READ_ONCE(task->mm) == mm &&
+ READ_ONCE(task->parent) == current;
+ put_task_struct(task);
+ }
+ return ptrace_active;
+ default:
+ return true;
+ }
}
static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -855,7 +909,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+ flags = write ? FOLL_WRITE : 0;
+ if (proc_mem_foll_force(file, mm))
+ flags |= FOLL_FORCE;
while (count > 0) {
size_t this_len = min_t(size_t, count, PAGE_SIZE);
@@ -932,6 +988,7 @@ static const struct file_operations proc_mem_operations = {
.write = mem_write,
.open = mem_open,
.release = mem_release,
+ .fop_flags = FOP_UNSIGNED_OFFSET,
};
static int environ_open(struct inode *inode, struct file *file)
@@ -2276,8 +2333,8 @@ proc_map_files_instantiate(struct dentry *dentry,
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_map_files_dentry_operations);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2456,13 +2513,13 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->sighand)
return ERR_PTR(-ESRCH);
- return seq_list_start(&tp->task->signal->posix_timers, *pos);
+ return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+ return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
@@ -2491,7 +2548,7 @@ static int show_timer(struct seq_file *m, void *v)
[SIGEV_THREAD] = "thread",
};
- timer = list_entry((struct list_head *)v, struct k_itimer, list);
+ timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id);
@@ -2569,10 +2626,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
+ if (task_is_realtime(p))
+ slack_ns = 0;
+ else if (slack_ns == 0)
+ slack_ns = p->default_timer_slack_ns;
+ p->timer_slack_ns = slack_ns;
task_unlock(p);
out:
@@ -3870,12 +3928,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
+ /* We cache the tgid value that the last readdir call couldn't
+ * return and lseek resets it to 0.
*/
ns = proc_pid_ns(inode->i_sb);
- tid = (int)file->f_version;
- file->f_version = 0;
+ tid = (int)(intptr_t)file->private_data;
+ file->private_data = NULL;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
@@ -3890,7 +3948,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readdir call */
- file->f_version = (u64)tid;
+ file->private_data = (void *)(intptr_t)tid;
put_task_struct(task);
break;
}
@@ -3915,6 +3973,24 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
return 0;
}
+/*
+ * proc_task_readdir() sets @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
+ */
+static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ u64 cookie = (u64)(intptr_t)file->private_data;
+ loff_t off;
+
+ off = generic_llseek_cookie(file, offset, whence, &cookie);
+ WARN_ON_ONCE(cookie > INT_MAX);
+ file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
+ return off;
+}
+
static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
@@ -3925,7 +4001,7 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.iterate_shared = proc_task_readdir,
- .llseek = generic_file_llseek,
+ .llseek = proc_dir_llseek,
};
void __init set_proc_pid_nlink(void)
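The llseek conversion keeps the directory resume semantics userspace relies on; a plain libc sketch of the behaviour proc_dir_llseek() preserves while the resume tid moves from f_version to the private_data cookie (nothing here is specific to this patch):

    #include <dirent.h>
    #include <stdio.h>

    int main(void)
    {
            DIR *d = opendir("/proc/self/task");
            struct dirent *e;
            long pos;

            if (!d)
                    return 1;
            e = readdir(d);         /* consume the first entry */
            pos = telldir(d);       /* remember the stream position */
            seekdir(d, pos);        /* lseek()s the fd, i.e. proc_dir_llseek() */
            while ((e = readdir(d)))
                    printf("%s\n", e->d_name);
            closedir(d);
            return 0;
    }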
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index e0758fe7936d..b7cab1ad990d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v)
{ CON_ENABLED, 'E' },
{ CON_CONSDEV, 'C' },
{ CON_BOOT, 'B' },
+ { CON_NBCON, 'N' },
{ CON_PRINTBUFFER, 'p' },
{ CON_BRL, 'b' },
{ CON_ANYTIME, 'a' },
@@ -58,8 +59,8 @@ static int show_console_dev(struct seq_file *m, void *v)
seq_printf(m, "%s%d", con->name, con->index);
seq_pad(m, ' ');
seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
- con->write ? 'W' : '-', con->unblank ? 'U' : '-',
- flags);
+ ((con->flags & CON_NBCON) || con->write) ? 'W' : '-',
+ con->unblank ? 'U' : '-', flags);
if (dev)
seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
@@ -68,6 +69,7 @@ static int show_console_dev(struct seq_file *m, void *v)
}
static void *c_start(struct seq_file *m, loff_t *pos)
+ __acquires(&console_mutex)
{
struct console *con;
loff_t off = 0;
@@ -94,6 +96,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
}
static void c_stop(struct seq_file *m, void *v)
+ __releases(&console_mutex)
{
console_list_unlock();
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 586bbc84ca04..1f54a54bfb91 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -59,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v)
real_mount(file->f_path.mnt)->mnt_id,
file_inode(file)->i_ino);
- /* show_fd_locks() never deferences files so a stale value is safe */
+ /* show_fd_locks() never dereferences files, so a stale value is safe */
show_fd_locks(m, file, files);
if (seq_has_overflowed(m))
goto out;
@@ -220,8 +220,8 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry,
ei->op.proc_get_link = proc_fd_link;
tid_fd_update_inode(task, inode, data->mode);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -312,14 +312,14 @@ static int proc_readfd_count(struct inode *inode, loff_t *count)
return 0;
}
-static int proc_readfd(struct file *file, struct dir_context *ctx)
+static int proc_fd_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
}
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfd,
+ .iterate_shared = proc_fd_iterate,
.llseek = generic_file_llseek,
};
@@ -397,8 +397,8 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
inode->i_fop = &proc_fdinfo_file_operations;
tid_fd_update_inode(task, inode, 0);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *
@@ -407,7 +407,7 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
-static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
@@ -421,6 +421,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
const struct file_operations proc_fdinfo_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfdinfo,
+ .iterate_shared = proc_fdinfo_iterate,
.llseek = generic_file_llseek,
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index c02f1e63f82d..dbe82cf23ee4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -464,9 +464,9 @@ struct proc_dir_entry *proc_symlink(const char *name,
(S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
if (ent) {
- ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
+ ent->size = strlen(dest);
+ ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL);
if (ent->data) {
- strcpy((char*)ent->data,dest);
ent->proc_iops = &proc_link_inode_operations;
ent = proc_register(parent, ent);
} else {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a8a8576d8592..9e3f25e4c188 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -349,3 +349,16 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde)
/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
pde->proc_dops = &proc_net_dentry_ops;
}
+
+/*
+ * Add a new procfs dentry that can't serve as a mountpoint. That should
+ * encompass anything that is ephemeral and can just disappear while the
+ * process is still around.
+ */
+static inline struct dentry *proc_splice_unmountable(struct inode *inode,
+ struct dentry *dentry, const struct dentry_operations *d_ops)
+{
+ d_set_d_op(dentry, d_ops);
+ dont_mount(dentry);
+ return d_splice_alias(inode, dentry);
+}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8e08a9a1b7ed..7d0acdad74e2 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -235,7 +235,7 @@ static int kcore_ram_list(struct list_head *list)
int nid, ret;
unsigned long end_pfn;
- /* Not inialized....update now */
+ /* Not initialized....update now */
/* find out "max pfn" */
end_pfn = 0;
for_each_node_state(nid, N_MEMORY) {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5f171ad7b436..ade74a396968 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -976,7 +976,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
+#if VM_PKEY_BIT3
[ilog2(VM_PKEY_BIT3)] = "",
+#endif
#if VM_PKEY_BIT4
[ilog2(VM_PKEY_BIT4)] = "",
#endif
@@ -987,8 +989,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
[ilog2(VM_DROPPABLE)] = "dp",
+#endif
+#ifdef CONFIG_64BIT
[ilog2(VM_SEALED)] = "sl",
#endif
};
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 3497ede88aa0..84719e2bcbc4 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -288,13 +288,13 @@ static void pstore_dump(struct kmsg_dumper *dumper,
why = kmsg_dump_reason_str(reason);
if (pstore_cannot_block_path(reason)) {
- if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ if (!raw_spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
pr_err("dump skipped in %s path because of concurrent dump\n",
in_nmi() ? "NMI" : why);
return;
}
} else {
- spin_lock_irqsave(&psinfo->buf_lock, flags);
+ raw_spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -364,7 +364,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+ raw_spin_unlock_irqrestore(&psinfo->buf_lock, flags);
if (saved_ret) {
pr_err_once("backend (%s) writing error (%d)\n", psinfo->name,
@@ -503,7 +503,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- spin_lock_init(&psinfo->buf_lock);
+ raw_spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index c1cfb8a19e9d..b4d10e45f2e4 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -24,13 +24,15 @@ static unsigned qnx6_lfile_checksum(char *name, unsigned size)
return crc;
}
-static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
+static void *qnx6_get_folio(struct inode *dir, unsigned long n,
+ struct folio **foliop)
{
- struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
- if (!IS_ERR(page))
- kmap(page);
- return page;
+ struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL);
+
+ if (IS_ERR(folio))
+ return folio;
+ *foliop = folio;
+ return kmap_local_folio(folio, 0);
}
static unsigned last_entry(struct inode *inode, unsigned long page_nr)
@@ -44,19 +46,20 @@ static unsigned last_entry(struct inode *inode, unsigned long page_nr)
static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
struct qnx6_long_dir_entry *de,
- struct page **p)
+ struct folio **foliop)
{
struct qnx6_sb_info *sbi = QNX6_SB(sb);
u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */
u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */
- /* within page */
- u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK;
+ u32 offs;
struct address_space *mapping = sbi->longfile->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
- if (IS_ERR(page))
- return ERR_CAST(page);
- kmap(*p = page);
- return (struct qnx6_long_filename *)(page_address(page) + offs);
+ struct folio *folio = read_mapping_folio(mapping, n, NULL);
+
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
+ offs = offset_in_folio(folio, s << sb->s_blocksize_bits);
+ *foliop = folio;
+ return kmap_local_folio(folio, offs);
}
static int qnx6_dir_longfilename(struct inode *inode,
@@ -67,7 +70,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
struct qnx6_long_filename *lf;
struct super_block *s = inode->i_sb;
struct qnx6_sb_info *sbi = QNX6_SB(s);
- struct page *page;
+ struct folio *folio;
int lf_size;
if (de->de_size != 0xff) {
@@ -76,7 +79,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
pr_err("invalid direntry size (%i).\n", de->de_size);
return 0;
}
- lf = qnx6_longname(s, de, &page);
+ lf = qnx6_longname(s, de, &folio);
if (IS_ERR(lf)) {
pr_err("Error reading longname\n");
return 0;
@@ -87,7 +90,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
if (lf_size > QNX6_LONG_NAME_MAX) {
pr_debug("file %s\n", lf->lf_fname);
pr_err("Filename too long (%i)\n", lf_size);
- qnx6_put_page(page);
+ folio_release_kmap(folio, lf);
return 0;
}
@@ -100,11 +103,11 @@ static int qnx6_dir_longfilename(struct inode *inode,
pr_debug("qnx6_readdir:%.*s inode:%u\n",
lf_size, lf->lf_fname, de_inode);
if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
- qnx6_put_page(page);
+ folio_release_kmap(folio, lf);
return 0;
}
- qnx6_put_page(page);
+ folio_release_kmap(folio, lf);
/* success */
return 1;
}
@@ -117,26 +120,27 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
unsigned long npages = dir_pages(inode);
unsigned long n = pos >> PAGE_SHIFT;
- unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE;
+ unsigned offset = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE;
bool done = false;
ctx->pos = pos;
if (ctx->pos >= inode->i_size)
return 0;
- for ( ; !done && n < npages; n++, start = 0) {
- struct page *page = qnx6_get_page(inode, n);
- int limit = last_entry(inode, n);
+ for ( ; !done && n < npages; n++, offset = 0) {
struct qnx6_dir_entry *de;
- int i = start;
+ struct folio *folio;
+ char *kaddr = qnx6_get_folio(inode, n, &folio);
+ char *limit;
- if (IS_ERR(page)) {
+ if (IS_ERR(kaddr)) {
pr_err("%s(): read failed\n", __func__);
ctx->pos = (n + 1) << PAGE_SHIFT;
- return PTR_ERR(page);
+ return PTR_ERR(kaddr);
}
- de = ((struct qnx6_dir_entry *)page_address(page)) + start;
- for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
+ de = (struct qnx6_dir_entry *)(kaddr + offset);
+ limit = kaddr + last_entry(inode, n);
+ for (; (char *)de < limit; de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
int size = de->de_size;
u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
@@ -164,7 +168,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
}
}
}
- qnx6_put_page(page);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
@@ -177,23 +181,23 @@ static unsigned qnx6_long_match(int len, const char *name,
{
struct super_block *s = dir->i_sb;
struct qnx6_sb_info *sbi = QNX6_SB(s);
- struct page *page;
+ struct folio *folio;
int thislen;
- struct qnx6_long_filename *lf = qnx6_longname(s, de, &page);
+ struct qnx6_long_filename *lf = qnx6_longname(s, de, &folio);
if (IS_ERR(lf))
return 0;
thislen = fs16_to_cpu(sbi, lf->lf_size);
if (len != thislen) {
- qnx6_put_page(page);
+ folio_release_kmap(folio, lf);
return 0;
}
if (memcmp(name, lf->lf_fname, len) == 0) {
- qnx6_put_page(page);
+ folio_release_kmap(folio, lf);
return fs32_to_cpu(sbi, de->de_inode);
}
- qnx6_put_page(page);
+ folio_release_kmap(folio, lf);
return 0;
}
@@ -210,20 +214,17 @@ static unsigned qnx6_match(struct super_block *s, int len, const char *name,
}
-unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
- struct page **res_page)
+unsigned qnx6_find_ino(int len, struct inode *dir, const char *name)
{
struct super_block *s = dir->i_sb;
struct qnx6_inode_info *ei = QNX6_I(dir);
- struct page *page = NULL;
+ struct folio *folio;
unsigned long start, n;
unsigned long npages = dir_pages(dir);
unsigned ino;
struct qnx6_dir_entry *de;
struct qnx6_long_dir_entry *lde;
- *res_page = NULL;
-
if (npages == 0)
return 0;
start = ei->i_dir_start_lookup;
@@ -232,12 +233,11 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
n = start;
do {
- page = qnx6_get_page(dir, n);
- if (!IS_ERR(page)) {
+ de = qnx6_get_folio(dir, n, &folio);
+ if (!IS_ERR(de)) {
int limit = last_entry(dir, n);
int i;
- de = (struct qnx6_dir_entry *)page_address(page);
for (i = 0; i < limit; i++, de++) {
if (len <= QNX6_SHORT_NAME_MAX) {
/* short filename */
@@ -256,7 +256,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
} else
pr_err("undefined filename size in inode.\n");
}
- qnx6_put_page(page);
+ folio_release_kmap(folio, de - i);
}
if (++n >= npages)
@@ -265,8 +265,8 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
return 0;
found:
- *res_page = page;
ei->i_dir_start_lookup = n;
+ folio_release_kmap(folio, de);
return ino;
}
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 4f1735b882b1..85925ec0051a 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -184,17 +184,17 @@ static const char *qnx6_checkroot(struct super_block *s)
struct qnx6_dir_entry *dir_entry;
struct inode *root = d_inode(s->s_root);
struct address_space *mapping = root->i_mapping;
- struct page *page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
+ struct folio *folio = read_mapping_folio(mapping, 0, NULL);
+
+ if (IS_ERR(folio))
return "error reading root directory";
- kmap(page);
- dir_entry = page_address(page);
+ dir_entry = kmap_local_folio(folio, 0);
for (i = 0; i < 2; i++) {
/* maximum 3 bytes - due to match_root limitation */
if (strncmp(dir_entry[i].de_fname, match_root[i], 3))
error = 1;
}
- qnx6_put_page(page);
+ folio_release_kmap(folio, dir_entry);
if (error)
return "error reading root directory.";
return NULL;
@@ -518,7 +518,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
struct inode *inode;
struct qnx6_inode_info *ei;
struct address_space *mapping;
- struct page *page;
+ struct folio *folio;
u32 n, offs;
inode = iget_locked(sb, ino);
@@ -538,17 +538,16 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
return ERR_PTR(-EIO);
}
n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS);
- offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS);
mapping = sbi->inodes->i_mapping;
- page = read_mapping_page(mapping, n, NULL);
- if (IS_ERR(page)) {
+ folio = read_mapping_folio(mapping, n, NULL);
+ if (IS_ERR(folio)) {
pr_err("major problem: unable to read inode from dev %s\n",
sb->s_id);
iget_failed(inode);
- return ERR_CAST(page);
+ return ERR_CAST(folio);
}
- kmap(page);
- raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs;
+ offs = offset_in_folio(folio, (ino - 1) << QNX6_INODE_SIZE_BITS);
+ raw_inode = kmap_local_folio(folio, offs);
inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode);
i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid));
@@ -578,7 +577,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode->i_mapping->a_ops = &qnx6_aops;
} else
init_special_inode(inode, inode->i_mode, 0);
- qnx6_put_page(page);
+ folio_release_kmap(folio, raw_inode);
unlock_new_inode(inode);
return inode;
}
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index e2e98e653b8d..0f0755a9ecb5 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -17,7 +17,6 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
unsigned ino;
- struct page *page;
struct inode *foundinode = NULL;
const char *name = dentry->d_name.name;
int len = dentry->d_name.len;
@@ -25,10 +24,9 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
if (len > QNX6_LONG_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- ino = qnx6_find_entry(len, dir, name, &page);
+ ino = qnx6_find_ino(len, dir, name);
if (ino) {
foundinode = qnx6_iget(dir->i_sb, ino);
- qnx6_put_page(page);
if (IS_ERR(foundinode))
pr_debug("lookup->iget -> error %ld\n",
PTR_ERR(foundinode));
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index 34a6b126a3a9..56ed1367499e 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -126,11 +126,4 @@ static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n)
extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s,
int silent);
-static inline void qnx6_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
-extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
- struct page **res_page);
+unsigned qnx6_find_ino(int len, struct inode *dir, const char *name);
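A minimal sketch of the folio lifetime rules the qnx6 conversion above relies on (the example_* names are invented; the calls are real): read_mapping_folio() returns a referenced folio or an ERR_PTR, kmap_local_folio() maps the wanted offset, and folio_release_kmap() later drops the mapping and the reference in one call.

#include <linux/pagemap.h>
#include <linux/highmem.h>

static void *example_get_record(struct address_space *mapping, pgoff_t index,
                                size_t offset, struct folio **foliop)
{
        struct folio *folio = read_mapping_folio(mapping, index, NULL);

        if (IS_ERR(folio))
                return folio;                   /* caller checks with IS_ERR() */
        *foliop = folio;
        return kmap_local_folio(folio, offset);
}

/*
 * Usage:
 *      struct folio *folio;
 *      void *p = example_get_record(mapping, n, offs, &folio);
 *
 *      if (!IS_ERR(p)) {
 *              ... read the record through p ...
 *              folio_release_kmap(folio, p);   // kunmap_local() + folio_put()
 *      }
 */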
diff --git a/fs/read_write.c b/fs/read_write.c
index 90e283b31ca1..070a7c33b9dd 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -36,22 +36,24 @@ EXPORT_SYMBOL(generic_ro_fops);
static inline bool unsigned_offsets(struct file *file)
{
- return file->f_mode & FMODE_UNSIGNED_OFFSET;
+ return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
}
/**
- * vfs_setpos - update the file offset for lseek
+ * vfs_setpos_cookie - update the file offset for lseek and reset cookie
* @file: file structure in question
* @offset: file offset to seek to
* @maxsize: maximum file size
+ * @cookie: cookie to reset
*
- * This is a low-level filesystem helper for updating the file offset to
- * the value specified by @offset if the given offset is valid and it is
- * not equal to the current file offset.
+ * Update the file offset to the value specified by @offset if the given
+ * offset is valid and it is not equal to the current file offset and
+ * reset the specified cookie to indicate that a seek happened.
*
* Return the specified offset on success and -EINVAL on invalid offset.
*/
-loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
+static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
+ loff_t maxsize, u64 *cookie)
{
if (offset < 0 && !unsigned_offsets(file))
return -EINVAL;
@@ -60,35 +62,48 @@ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
if (offset != file->f_pos) {
file->f_pos = offset;
- file->f_version = 0;
+ if (cookie)
+ *cookie = 0;
}
return offset;
}
-EXPORT_SYMBOL(vfs_setpos);
/**
- * generic_file_llseek_size - generic llseek implementation for regular files
- * @file: file structure to seek on
+ * vfs_setpos - update the file offset for lseek
+ * @file: file structure in question
* @offset: file offset to seek to
- * @whence: type of seek
- * @maxsize: max size of this file in file system
- * @eof: offset used for SEEK_END position
+ * @maxsize: maximum file size
*
- * This is a variant of generic_file_llseek that allows passing in a custom
- * maximum file size and a custom EOF position, for e.g. hashed directories
+ * This is a low-level filesystem helper for updating the file offset to
+ * the value specified by @offset if the given offset is valid and it is
+ * not equal to the current file offset.
*
- * Synchronization:
- * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
- * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
- * read/writes behave like SEEK_SET against seeks.
+ * Return the specified offset on success and -EINVAL on invalid offset.
*/
-loff_t
-generic_file_llseek_size(struct file *file, loff_t offset, int whence,
- loff_t maxsize, loff_t eof)
+loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
+{
+ return vfs_setpos_cookie(file, offset, maxsize, NULL);
+}
+EXPORT_SYMBOL(vfs_setpos);
+
+/**
+ * must_set_pos - check whether f_pos has to be updated
+ * @file: file to seek on
+ * @offset: offset to use
+ * @whence: type of seek operation
+ * @eof: end of file
+ *
+ * Check whether f_pos needs to be updated and update @offset according
+ * to @whence.
+ *
+ * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
+ * updated, and negative error code on failure.
+ */
+static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
{
switch (whence) {
case SEEK_END:
- offset += eof;
+ *offset += eof;
break;
case SEEK_CUR:
/*
@@ -97,23 +112,17 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
* f_pos value back to the file because a concurrent read(),
* write() or lseek() might have altered it
*/
- if (offset == 0)
- return file->f_pos;
- /*
- * f_lock protects against read/modify/write race with other
- * SEEK_CURs. Note that parallel writes and reads behave
- * like SEEK_SET.
- */
- spin_lock(&file->f_lock);
- offset = vfs_setpos(file, file->f_pos + offset, maxsize);
- spin_unlock(&file->f_lock);
- return offset;
+ if (*offset == 0) {
+ *offset = file->f_pos;
+ return 0;
+ }
+ break;
case SEEK_DATA:
/*
* In the generic case the entire file is data, so as long as
* offset isn't at the end of the file then the offset is data.
*/
- if ((unsigned long long)offset >= eof)
+ if ((unsigned long long)*offset >= eof)
return -ENXIO;
break;
case SEEK_HOLE:
@@ -121,17 +130,103 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
* There is a virtual hole at the end of the file, so as long as
* offset isn't i_size or larger, return i_size.
*/
- if ((unsigned long long)offset >= eof)
+ if ((unsigned long long)*offset >= eof)
return -ENXIO;
- offset = eof;
+ *offset = eof;
break;
}
+ return 1;
+}
+
+/**
+ * generic_file_llseek_size - generic llseek implementation for regular files
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @maxsize: max size of this file in file system
+ * @eof: offset used for SEEK_END position
+ *
+ * This is a variant of generic_file_llseek that allows passing in a custom
+ * maximum file size and a custom EOF position, for e.g. hashed directories
+ *
+ * Synchronization:
+ * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
+ * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
+ * read/writes behave like SEEK_SET against seeks.
+ */
+loff_t
+generic_file_llseek_size(struct file *file, loff_t offset, int whence,
+ loff_t maxsize, loff_t eof)
+{
+ int ret;
+
+ ret = must_set_pos(file, &offset, whence, eof);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return offset;
+
+ if (whence == SEEK_CUR) {
+ /*
+ * f_lock protects against read/modify/write race with
+ * other SEEK_CURs. Note that parallel writes and reads
+ * behave like SEEK_SET.
+ */
+ guard(spinlock)(&file->f_lock);
+ return vfs_setpos(file, file->f_pos + offset, maxsize);
+ }
+
return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
/**
+ * generic_llseek_cookie - versioned llseek implementation
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @cookie: cookie to update
+ *
+ * See generic_file_llseek for a general description and locking assumptions.
+ *
+ * In contrast to generic_file_llseek, this function also resets a
+ * specified cookie to indicate a seek took place.
+ */
+loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
+ u64 *cookie)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t maxsize = inode->i_sb->s_maxbytes;
+ loff_t eof = i_size_read(inode);
+ int ret;
+
+ if (WARN_ON_ONCE(!cookie))
+ return -EINVAL;
+
+ /*
+ * Require that this is only used for directories that guarantee
+ * synchronization between readdir and seek so that an update to
+ * @cookie is correctly synchronized with concurrent readdir.
+ */
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
+ return -EINVAL;
+
+ ret = must_set_pos(file, &offset, whence, eof);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return offset;
+
+ /* No need to hold f_lock because we know that f_pos_lock is held. */
+ if (whence == SEEK_CUR)
+ return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);
+
+ return vfs_setpos_cookie(file, offset, maxsize, cookie);
+}
+EXPORT_SYMBOL(generic_llseek_cookie);
+
+/**
* generic_file_llseek - generic llseek implementation for regular files
* @file: file structure to seek on
* @offset: file offset to seek to
@@ -270,10 +365,8 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
}
retval = -EINVAL;
if (offset >= 0 || unsigned_offsets(file)) {
- if (offset != file->f_pos) {
+ if (offset != file->f_pos)
file->f_pos = offset;
- file->f_version = 0;
- }
retval = offset;
}
out:
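A hypothetical caller of the new generic_llseek_cookie() helper (sketch only; the struct and its field are invented): a directory opened with FMODE_ATOMIC_POS can pass in a private generation counter, which the helper zeroes whenever the position actually changes, taking over the role of the removed f_version reset.

#include <linux/fs.h>

struct example_dir_private {
        u64 cookie;             /* readdir generation, reset when f_pos moves */
};

static loff_t example_dir_llseek(struct file *file, loff_t offset, int whence)
{
        struct example_dir_private *priv = file->private_data;

        /* generic_llseek_cookie() warns and fails unless FMODE_ATOMIC_POS is set */
        return generic_llseek_cookie(file, offset, whence, &priv->cookie);
}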
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9b43a81a6488..72c53129c952 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2178,7 +2178,7 @@ static int grab_tail_page(struct inode *inode,
unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
struct buffer_head *bh;
struct buffer_head *head;
- struct page *page;
+ struct folio *folio;
int error;
/*
@@ -2190,20 +2190,20 @@ static int grab_tail_page(struct inode *inode,
if ((offset & (blocksize - 1)) == 0) {
return -ENOENT;
}
- page = grab_cache_page(inode->i_mapping, index);
- error = -ENOMEM;
- if (!page) {
- goto out;
- }
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(inode->i_mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/* start within the page of the last block in the file */
start = (offset / blocksize) * blocksize;
- error = __block_write_begin(page, start, offset - start,
+ error = __block_write_begin(folio, start, offset - start,
reiserfs_get_block_create_0);
if (error)
goto unlock;
- head = page_buffers(page);
+ head = folio_buffers(folio);
bh = head;
do {
if (pos >= start) {
@@ -2226,14 +2226,13 @@ static int grab_tail_page(struct inode *inode,
goto unlock;
}
*bh_result = bh;
- *page_result = page;
+ *page_result = &folio->page;
-out:
return error;
unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return error;
}
@@ -2736,23 +2735,24 @@ static void reiserfs_truncate_failed_write(struct inode *inode)
static int reiserfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode;
- struct page *page;
+ struct folio *folio;
pgoff_t index;
int ret;
int old_ref = 0;
inode = mapping->host;
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *foliop = folio;
reiserfs_wait_on_write_block(inode->i_sb);
- fix_tail_page_for_writing(page);
+ fix_tail_page_for_writing(&folio->page);
if (reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th;
th = (struct reiserfs_transaction_handle *)current->
@@ -2762,7 +2762,7 @@ static int reiserfs_write_begin(struct file *file,
old_ref = th->t_refcount;
th->t_refcount++;
}
- ret = __block_write_begin(page, pos, len, reiserfs_get_block);
+ ret = __block_write_begin(folio, pos, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
/*
@@ -2792,8 +2792,8 @@ static int reiserfs_write_begin(struct file *file,
}
}
if (ret) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
/* Truncate allocated blocks */
reiserfs_truncate_failed_write(inode);
}
@@ -2822,7 +2822,7 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
th->t_refcount++;
}
- ret = __block_write_begin(page, from, len, reiserfs_get_block);
+ ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
/*
@@ -2862,10 +2862,9 @@ static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
static int reiserfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
int ret = 0;
int update_sd = 0;
struct reiserfs_transaction_handle *th;
@@ -2887,7 +2886,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
}
flush_dcache_folio(folio);
- reiserfs_commit_page(inode, page, start, start + copied);
+ reiserfs_commit_page(inode, &folio->page, start, start + copied);
/*
* generic_commit_write does this for us, but does not update the
@@ -2942,8 +2941,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
out:
if (locked)
reiserfs_write_unlock(inode->i_sb);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (pos + len > inode->i_size)
reiserfs_truncate_failed_write(inode);
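A small sketch of the folio lookup the reiserfs conversion above uses (hypothetical wrapper; error handling trimmed): __filemap_get_folio() with FGP_WRITEBEGIN returns a locked, referenced folio or an ERR_PTR, replacing the old grab_cache_page_write_begin()-returns-NULL convention.

#include <linux/pagemap.h>

static struct folio *example_grab_for_write(struct address_space *mapping,
                                            pgoff_t index)
{
        struct folio *folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                                  mapping_gfp_mask(mapping));

        if (IS_ERR(folio))
                return folio;           /* e.g. ERR_PTR(-ENOMEM) */

        /* folio comes back locked and referenced; unlock and put it when done */
        return folio;
}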
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 68758b6fed94..0addcc849ff2 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -126,7 +126,7 @@ static int romfs_read_folio(struct file *file, struct folio *folio)
}
}
- buf = folio_zero_tail(folio, fillsize, buf);
+ buf = folio_zero_tail(folio, fillsize, buf + fillsize);
kunmap_local(buf);
folio_end_read(folio, ret == 0);
return ret;
diff --git a/fs/select.c b/fs/select.c
index 9515c3fa1a03..cae82e9e0dcc 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -77,19 +77,16 @@ u64 select_estimate_accuracy(struct timespec64 *tv)
{
u64 ret;
struct timespec64 now;
+ u64 slack = current->timer_slack_ns;
- /*
- * Realtime tasks get a slack of 0 for obvious reasons.
- */
-
- if (rt_task(current))
+ if (slack == 0)
return 0;
ktime_get_ts64(&now);
now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
- if (ret < current->timer_slack_ns)
- return current->timer_slack_ns;
+ if (ret < slack)
+ return slack;
return ret;
}
@@ -840,7 +837,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
struct poll_list {
struct poll_list *next;
unsigned int len;
- struct pollfd entries[];
+ struct pollfd entries[] __counted_by(len);
};
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
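A self-contained sketch (hypothetical struct, not from the patch) of what the __counted_by(len) annotation on the flexible array buys: once the count field is assigned, the compiler and the kernel's fortify checks know how many elements the array holds and can flag out-of-bounds accesses.

#include <linux/overflow.h>
#include <linux/slab.h>

struct example_list {
        unsigned int len;
        int values[] __counted_by(len);
};

static struct example_list *example_alloc(unsigned int n)
{
        struct example_list *p = kmalloc(struct_size(p, values, n), GFP_KERNEL);

        if (p)
                p->len = n;     /* set the counter before indexing values[] */
        return p;
}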
diff --git a/fs/signalfd.c b/fs/signalfd.c
index ec7b2da2477a..d0333bce015e 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -159,7 +159,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
DECLARE_WAITQUEUE(wait, current);
spin_lock_irq(&current->sighand->siglock);
- ret = dequeue_signal(current, &ctx->sigmask, info, &type);
+ ret = dequeue_signal(&ctx->sigmask, info, &type);
switch (ret) {
case 0:
if (!nonblock)
@@ -174,7 +174,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info
add_wait_queue(&current->sighand->signalfd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- ret = dequeue_signal(current, &ctx->sigmask, info, &type);
+ ret = dequeue_signal(&ctx->sigmask, info, &type);
if (ret != 0)
break;
if (signal_pending(current)) {
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index 2517dc242386..2aff6d1395ce 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -204,4 +204,18 @@ config CIFS_ROOT
Most people say N here.
+config CIFS_COMPRESSION
+ bool "SMB message compression (Experimental)"
+ depends on CIFS
+ default n
+ help
+ Enables over-the-wire message compression for SMB 3.1.1
+ mounts when negotiated with the server.
+
+ Only write requests with data size >= PAGE_SIZE will be
+ compressed to avoid wasting resources.
+
+ Say Y here if you want SMB traffic to be compressed.
+ If unsure, say N.
+
endif
diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile
index e11985f2460b..22023e30915b 100644
--- a/fs/smb/client/Makefile
+++ b/fs/smb/client/Makefile
@@ -33,3 +33,5 @@ cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o
cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o cifssmb.o
+
+cifs-$(CONFIG_CIFS_COMPRESSION) += compress.o compress/lz77.o
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index c71ae5c04306..e03c890de0a0 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -350,6 +350,9 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CIFS_SWN_UPCALL
seq_puts(m, ",WITNESS");
#endif
+#ifdef CONFIG_CIFS_COMPRESSION
+ seq_puts(m, ",COMPRESSION");
+#endif
seq_putc(m, '\n');
seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize);
seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
@@ -475,7 +478,9 @@ skip_rdma:
}
seq_puts(m, "\nCompression: ");
- if (!server->compression.requested)
+ if (!IS_ENABLED(CONFIG_CIFS_COMPRESSION))
+ seq_puts(m, "no built-in support");
+ else if (!server->compression.requested)
seq_puts(m, "disabled on mount");
else if (server->compression.enabled)
seq_printf(m, "enabled (%s)", compression_alg_str(server->compression.alg));
@@ -1072,7 +1077,7 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
static void
cifs_security_flags_handle_must_flags(unsigned int *flags)
{
- unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
+ unsigned int signflags = *flags & (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL);
if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
*flags = CIFSSEC_MUST_KRB5;
diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c
index f5b6df82e857..1d294d53f662 100644
--- a/fs/smb/client/cifsacl.c
+++ b/fs/smb/client/cifsacl.c
@@ -27,18 +27,18 @@
#include "cifs_unicode.h"
/* security id for everyone/world system group */
-static const struct cifs_sid sid_everyone = {
+static const struct smb_sid sid_everyone = {
1, 1, {0, 0, 0, 0, 0, 1}, {0} };
/* security id for Authenticated Users system group */
-static const struct cifs_sid sid_authusers = {
+static const struct smb_sid sid_authusers = {
1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
/* S-1-22-1 Unmapped Unix users */
-static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22},
+static const struct smb_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22},
{cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
/* S-1-22-2 Unmapped Unix groups */
-static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22},
+static const struct smb_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22},
{cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
/*
@@ -48,17 +48,17 @@ static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22},
/* S-1-5-88 MS NFS and Apple style UID/GID/mode */
/* S-1-5-88-1 Unix uid */
-static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5},
+static const struct smb_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5},
{cpu_to_le32(88),
cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
/* S-1-5-88-2 Unix gid */
-static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5},
+static const struct smb_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5},
{cpu_to_le32(88),
cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
/* S-1-5-88-3 Unix mode */
-static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5},
+static const struct smb_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5},
{cpu_to_le32(88),
cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
@@ -106,7 +106,7 @@ static struct key_type cifs_idmap_key_type = {
};
static char *
-sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
+sid_to_key_str(struct smb_sid *sidptr, unsigned int type)
{
int i, len;
unsigned int saval;
@@ -158,7 +158,7 @@ sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
* the same returns zero, if they do not match returns non-zero.
*/
static int
-compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
+compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid)
{
int i;
int num_subauth, num_sat, num_saw;
@@ -187,7 +187,7 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
/* compare all of the subauth values if any */
num_sat = ctsid->num_subauth;
num_saw = cwsid->num_subauth;
- num_subauth = num_sat < num_saw ? num_sat : num_saw;
+ num_subauth = min(num_sat, num_saw);
if (num_subauth) {
for (i = 0; i < num_subauth; ++i) {
if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
@@ -204,11 +204,11 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
}
static bool
-is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group)
+is_well_known_sid(const struct smb_sid *psid, uint32_t *puid, bool is_group)
{
int i;
int num_subauth;
- const struct cifs_sid *pwell_known_sid;
+ const struct smb_sid *pwell_known_sid;
if (!psid || (puid == NULL))
return false;
@@ -260,7 +260,7 @@ is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group)
}
static __u16
-cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
+cifs_copy_sid(struct smb_sid *dst, const struct smb_sid *src)
{
int i;
__u16 size = 1 + 1 + 6;
@@ -277,11 +277,11 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
}
static int
-id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
+id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid)
{
int rc;
struct key *sidkey;
- struct cifs_sid *ksid;
+ struct smb_sid *ksid;
unsigned int ksid_size;
char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
const struct cred *saved_cred;
@@ -312,8 +312,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
* it could be.
*/
ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
- (struct cifs_sid *)&sidkey->payload :
- (struct cifs_sid *)sidkey->payload.data[0];
+ (struct smb_sid *)&sidkey->payload :
+ (struct smb_sid *)sidkey->payload.data[0];
ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
if (ksid_size > sidkey->datalen) {
@@ -336,7 +336,7 @@ invalidate_key:
}
int
-sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
+sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
int rc = 0;
@@ -515,43 +515,43 @@ exit_cifs_idmap(void)
}
/* copy ntsd, owner sid, and group sid from a security descriptor to another */
-static __u32 copy_sec_desc(const struct cifs_ntsd *pntsd,
- struct cifs_ntsd *pnntsd,
+static __u32 copy_sec_desc(const struct smb_ntsd *pntsd,
+ struct smb_ntsd *pnntsd,
__u32 sidsoffset,
- struct cifs_sid *pownersid,
- struct cifs_sid *pgrpsid)
+ struct smb_sid *pownersid,
+ struct smb_sid *pgrpsid)
{
- struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
- struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+ struct smb_sid *owner_sid_ptr, *group_sid_ptr;
+ struct smb_sid *nowner_sid_ptr, *ngroup_sid_ptr;
/* copy security descriptor control portion */
pnntsd->revision = pntsd->revision;
pnntsd->type = pntsd->type;
- pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd));
+ pnntsd->dacloffset = cpu_to_le32(sizeof(struct smb_ntsd));
pnntsd->sacloffset = 0;
pnntsd->osidoffset = cpu_to_le32(sidsoffset);
- pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid));
+ pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct smb_sid));
/* copy owner sid */
if (pownersid)
owner_sid_ptr = pownersid;
else
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ owner_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
- nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
+ nowner_sid_ptr = (struct smb_sid *)((char *)pnntsd + sidsoffset);
cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
/* copy group sid */
if (pgrpsid)
group_sid_ptr = pgrpsid;
else
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ group_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
- ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
- sizeof(struct cifs_sid));
+ ngroup_sid_ptr = (struct smb_sid *)((char *)pnntsd + sidsoffset +
+ sizeof(struct smb_sid));
cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
- return sidsoffset + (2 * sizeof(struct cifs_sid));
+ return sidsoffset + (2 * sizeof(struct smb_sid));
}
@@ -666,7 +666,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
return;
}
-static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct cifs_sid *psid)
+static __u16 cifs_copy_ace(struct smb_ace *dst, struct smb_ace *src, struct smb_sid *psid)
{
__u16 size = 1 + 1 + 2 + 4;
@@ -685,8 +685,8 @@ static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct ci
return size;
}
-static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
- const struct cifs_sid *psid, __u64 nmode,
+static __u16 fill_ace_for_sid(struct smb_ace *pntace,
+ const struct smb_sid *psid, __u64 nmode,
umode_t bits, __u8 access_type,
bool allow_delete_child)
{
@@ -723,7 +723,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
#ifdef CONFIG_CIFS_DEBUG2
-static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
+static void dump_ace(struct smb_ace *pace, char *end_of_acl)
{
int num_subauth;
@@ -758,15 +758,15 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
}
#endif
-static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
- struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
+static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl,
+ struct smb_sid *pownersid, struct smb_sid *pgrpsid,
struct cifs_fattr *fattr, bool mode_from_special_sid)
{
int i;
int num_aces = 0;
int acl_size;
char *acl_base;
- struct cifs_ace **ppace;
+ struct smb_ace **ppace;
/* BB need to add parm so we can store the SID BB */
@@ -793,21 +793,21 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
fattr->cf_mode &= ~(0777);
acl_base = (char *)pdacl;
- acl_size = sizeof(struct cifs_acl);
+ acl_size = sizeof(struct smb_acl);
num_aces = le32_to_cpu(pdacl->num_aces);
if (num_aces > 0) {
umode_t denied_mode = 0;
- if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+ if (num_aces > ULONG_MAX / sizeof(struct smb_ace *))
return;
- ppace = kmalloc_array(num_aces, sizeof(struct cifs_ace *),
+ ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *),
GFP_KERNEL);
if (!ppace)
return;
for (i = 0; i < num_aces; ++i) {
- ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
+ ppace[i] = (struct smb_ace *) (acl_base + acl_size);
#ifdef CONFIG_CIFS_DEBUG2
dump_ace(ppace[i], end_of_acl);
#endif
@@ -849,7 +849,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
/* memcpy((void *)(&(cifscred->aces[i])),
(void *)ppace[i],
- sizeof(struct cifs_ace)); */
+ sizeof(struct smb_ace)); */
acl_base = (char *)ppace[i];
acl_size = le16_to_cpu(ppace[i]->size);
@@ -861,7 +861,7 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
return;
}
-unsigned int setup_authusers_ACE(struct cifs_ace *pntace)
+unsigned int setup_authusers_ACE(struct smb_ace *pntace)
{
int i;
unsigned int ace_size = 20;
@@ -885,7 +885,7 @@ unsigned int setup_authusers_ACE(struct cifs_ace *pntace)
* Fill in the special SID based on the mode. See
* https://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx
*/
-unsigned int setup_special_mode_ACE(struct cifs_ace *pntace, __u64 nmode)
+unsigned int setup_special_mode_ACE(struct smb_ace *pntace, __u64 nmode)
{
int i;
unsigned int ace_size = 28;
@@ -907,7 +907,7 @@ unsigned int setup_special_mode_ACE(struct cifs_ace *pntace, __u64 nmode)
return ace_size;
}
-unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace)
+unsigned int setup_special_user_owner_ACE(struct smb_ace *pntace)
{
int i;
unsigned int ace_size = 28;
@@ -930,8 +930,8 @@ unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace)
}
static void populate_new_aces(char *nacl_base,
- struct cifs_sid *pownersid,
- struct cifs_sid *pgrpsid,
+ struct smb_sid *pownersid,
+ struct smb_sid *pgrpsid,
__u64 *pnmode, u32 *pnum_aces, u16 *pnsize,
bool modefromsid)
{
@@ -944,17 +944,17 @@ static void populate_new_aces(char *nacl_base,
__u64 deny_user_mode = 0;
__u64 deny_group_mode = 0;
bool sticky_set = false;
- struct cifs_ace *pnntace = NULL;
+ struct smb_ace *pnntace = NULL;
nmode = *pnmode;
num_aces = *pnum_aces;
nsize = *pnsize;
if (modefromsid) {
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += setup_special_mode_ACE(pnntace, nmode);
num_aces++;
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += setup_authusers_ACE(pnntace);
num_aces++;
goto set_size;
@@ -967,7 +967,7 @@ static void populate_new_aces(char *nacl_base,
* updated in the inode.
*/
- if (!memcmp(pownersid, pgrpsid, sizeof(struct cifs_sid))) {
+ if (!memcmp(pownersid, pgrpsid, sizeof(struct smb_sid))) {
/*
* Case when owner and group SIDs are the same.
* Set the more restrictive of the two modes.
@@ -992,7 +992,7 @@ static void populate_new_aces(char *nacl_base,
sticky_set = true;
if (deny_user_mode) {
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += fill_ace_for_sid(pnntace, pownersid, deny_user_mode,
0700, ACCESS_DENIED, false);
num_aces++;
@@ -1000,31 +1000,31 @@ static void populate_new_aces(char *nacl_base,
/* Group DENY ACE does not conflict with owner ALLOW ACE. Keep in preferred order*/
if (deny_group_mode && !(deny_group_mode & (user_mode >> 3))) {
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode,
0070, ACCESS_DENIED, false);
num_aces++;
}
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += fill_ace_for_sid(pnntace, pownersid, user_mode,
0700, ACCESS_ALLOWED, true);
num_aces++;
/* Group DENY ACE conflicts with owner ALLOW ACE. So keep it after. */
if (deny_group_mode && (deny_group_mode & (user_mode >> 3))) {
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode,
0070, ACCESS_DENIED, false);
num_aces++;
}
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += fill_ace_for_sid(pnntace, pgrpsid, group_mode,
0070, ACCESS_ALLOWED, !sticky_set);
num_aces++;
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += fill_ace_for_sid(pnntace, &sid_everyone, other_mode,
0007, ACCESS_ALLOWED, !sticky_set);
num_aces++;
@@ -1034,31 +1034,31 @@ set_size:
*pnsize = nsize;
}
-static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
- struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
- struct cifs_sid *pnownersid, struct cifs_sid *pngrpsid)
+static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *pndacl,
+ struct smb_sid *pownersid, struct smb_sid *pgrpsid,
+ struct smb_sid *pnownersid, struct smb_sid *pngrpsid)
{
int i;
u16 size = 0;
- struct cifs_ace *pntace = NULL;
+ struct smb_ace *pntace = NULL;
char *acl_base = NULL;
u32 src_num_aces = 0;
u16 nsize = 0;
- struct cifs_ace *pnntace = NULL;
+ struct smb_ace *pnntace = NULL;
char *nacl_base = NULL;
u16 ace_size = 0;
acl_base = (char *)pdacl;
- size = sizeof(struct cifs_acl);
+ size = sizeof(struct smb_acl);
src_num_aces = le32_to_cpu(pdacl->num_aces);
nacl_base = (char *)pndacl;
- nsize = sizeof(struct cifs_acl);
+ nsize = sizeof(struct smb_acl);
/* Go through all the ACEs */
for (i = 0; i < src_num_aces; ++i) {
- pntace = (struct cifs_ace *) (acl_base + size);
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pntace = (struct smb_ace *) (acl_base + size);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
if (pnownersid && compare_sids(&pntace->sid, pownersid) == 0)
ace_size = cifs_copy_ace(pnntace, pntace, pnownersid);
@@ -1074,24 +1074,24 @@ static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl
return nsize;
}
-static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
- struct cifs_sid *pownersid, struct cifs_sid *pgrpsid,
+static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl,
+ struct smb_sid *pownersid, struct smb_sid *pgrpsid,
__u64 *pnmode, bool mode_from_sid)
{
int i;
u16 size = 0;
- struct cifs_ace *pntace = NULL;
+ struct smb_ace *pntace = NULL;
char *acl_base = NULL;
u32 src_num_aces = 0;
u16 nsize = 0;
- struct cifs_ace *pnntace = NULL;
+ struct smb_ace *pnntace = NULL;
char *nacl_base = NULL;
u32 num_aces = 0;
bool new_aces_set = false;
/* Assuming that pndacl and pnmode are never NULL */
nacl_base = (char *)pndacl;
- nsize = sizeof(struct cifs_acl);
+ nsize = sizeof(struct smb_acl);
/* If pdacl is NULL, we don't have a src. Simply populate new ACL. */
if (!pdacl) {
@@ -1103,12 +1103,12 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
}
acl_base = (char *)pdacl;
- size = sizeof(struct cifs_acl);
+ size = sizeof(struct smb_acl);
src_num_aces = le32_to_cpu(pdacl->num_aces);
/* Retain old ACEs which we can retain */
for (i = 0; i < src_num_aces; ++i) {
- pntace = (struct cifs_ace *) (acl_base + size);
+ pntace = (struct smb_ace *) (acl_base + size);
if (!new_aces_set && (pntace->flags & INHERITED_ACE)) {
/* Place the new ACEs in between existing explicit and inherited */
@@ -1130,7 +1130,7 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
}
/* update the pointer to the next ACE to populate*/
- pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ pnntace = (struct smb_ace *) (nacl_base + nsize);
nsize += cifs_copy_ace(pnntace, pntace, NULL);
num_aces++;
@@ -1156,7 +1156,7 @@ finalize_dacl:
return 0;
}
-static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
+static int parse_sid(struct smb_sid *psid, char *end_of_acl)
{
/* BB need to add parm so we can store the SID BB */
@@ -1191,24 +1191,24 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
/* Convert CIFS ACL to POSIX form */
static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
- struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr,
+ struct smb_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr,
bool get_mode_from_special_sid)
{
int rc = 0;
- struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
- struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
+ struct smb_sid *owner_sid_ptr, *group_sid_ptr;
+ struct smb_acl *dacl_ptr; /* no need for SACL ptr */
char *end_of_acl = ((char *)pntsd) + acl_len;
__u32 dacloffset;
if (pntsd == NULL)
return -EIO;
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ owner_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ group_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
dacloffset = le32_to_cpu(pntsd->dacloffset);
- dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
le32_to_cpu(pntsd->gsidoffset),
@@ -1249,7 +1249,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
}
/* Convert permission bits from mode to equivalent CIFS ACL */
-static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
+static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd,
__u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid,
bool mode_from_sid, bool id_from_sid, int *aclflag)
{
@@ -1257,30 +1257,30 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
__u32 dacloffset;
__u32 ndacloffset;
__u32 sidsoffset;
- struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
- struct cifs_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL;
- struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */
- struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
+ struct smb_sid *owner_sid_ptr, *group_sid_ptr;
+ struct smb_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL;
+ struct smb_acl *dacl_ptr = NULL; /* no need for SACL ptr */
+ struct smb_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
char *end_of_acl = ((char *)pntsd) + secdesclen;
u16 size = 0;
dacloffset = le32_to_cpu(pntsd->dacloffset);
if (dacloffset) {
- dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
if (end_of_acl < (char *)dacl_ptr + le16_to_cpu(dacl_ptr->size)) {
cifs_dbg(VFS, "Server returned illegal ACL size\n");
return -EINVAL;
}
}
- owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ owner_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
- group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
+ group_sid_ptr = (struct smb_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
- ndacloffset = sizeof(struct cifs_ntsd);
- ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+ ndacloffset = sizeof(struct smb_ntsd);
+ ndacl_ptr = (struct smb_acl *)((char *)pnntsd + ndacloffset);
ndacl_ptr->revision =
dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION);
@@ -1297,15 +1297,15 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
*aclflag |= CIFS_ACL_DACL;
} else {
- ndacloffset = sizeof(struct cifs_ntsd);
- ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
+ ndacloffset = sizeof(struct smb_ntsd);
+ ndacl_ptr = (struct smb_acl *)((char *)pnntsd + ndacloffset);
ndacl_ptr->revision =
dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION);
ndacl_ptr->num_aces = dacl_ptr ? dacl_ptr->num_aces : 0;
if (uid_valid(uid)) { /* chown */
uid_t id;
- nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid),
+ nowner_sid_ptr = kzalloc(sizeof(struct smb_sid),
GFP_KERNEL);
if (!nowner_sid_ptr) {
rc = -ENOMEM;
@@ -1334,7 +1334,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
}
if (gid_valid(gid)) { /* chgrp */
gid_t id;
- ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid),
+ ngroup_sid_ptr = kzalloc(sizeof(struct smb_sid),
GFP_KERNEL);
if (!ngroup_sid_ptr) {
rc = -ENOMEM;
@@ -1385,11 +1385,11 @@ chown_chgrp_exit:
}
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
-struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen,
u32 __maybe_unused unused)
{
- struct cifs_ntsd *pntsd = NULL;
+ struct smb_ntsd *pntsd = NULL;
unsigned int xid;
int rc;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -1410,10 +1410,10 @@ struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
return pntsd;
}
-static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
+static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
const char *path, u32 *pacllen)
{
- struct cifs_ntsd *pntsd = NULL;
+ struct smb_ntsd *pntsd = NULL;
int oplock = 0;
unsigned int xid;
int rc;
@@ -1454,11 +1454,11 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
}
/* Retrieve an ACL from the server */
-struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
struct inode *inode, const char *path,
u32 *pacllen, u32 info)
{
- struct cifs_ntsd *pntsd = NULL;
+ struct smb_ntsd *pntsd = NULL;
struct cifsFileInfo *open_file = NULL;
if (inode)
@@ -1472,7 +1472,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
}
/* Set an ACL on the server */
-int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen,
struct inode *inode, const char *path, int aclflag)
{
int oplock = 0;
@@ -1528,7 +1528,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
struct inode *inode, bool mode_from_special_sid,
const char *path, const struct cifs_fid *pfid)
{
- struct cifs_ntsd *pntsd = NULL;
+ struct smb_ntsd *pntsd = NULL;
u32 acllen = 0;
int rc = 0;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -1580,9 +1580,9 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
__u32 secdesclen = 0;
__u32 nsecdesclen = 0;
__u32 dacloffset = 0;
- struct cifs_acl *dacl_ptr = NULL;
- struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
- struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
+ struct smb_acl *dacl_ptr = NULL;
+ struct smb_ntsd *pntsd = NULL; /* acl obtained from server */
+ struct smb_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
struct smb_version_operations *ops;
@@ -1625,18 +1625,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
nsecdesclen = secdesclen;
if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
if (mode_from_sid)
- nsecdesclen += 2 * sizeof(struct cifs_ace);
+ nsecdesclen += 2 * sizeof(struct smb_ace);
else /* cifsacl */
- nsecdesclen += 5 * sizeof(struct cifs_ace);
+ nsecdesclen += 5 * sizeof(struct smb_ace);
} else { /* chown */
/* When ownership changes, changes new owner sid length could be different */
- nsecdesclen = sizeof(struct cifs_ntsd) + (sizeof(struct cifs_sid) * 2);
+ nsecdesclen = sizeof(struct smb_ntsd) + (sizeof(struct smb_sid) * 2);
dacloffset = le32_to_cpu(pntsd->dacloffset);
if (dacloffset) {
- dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
+ dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
if (mode_from_sid)
nsecdesclen +=
- le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct cifs_ace);
+ le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace);
else /* cifsacl */
nsecdesclen += le16_to_cpu(dacl_ptr->size);
}
diff --git a/fs/smb/client/cifsacl.h b/fs/smb/client/cifsacl.h
index ccbfc754bd3c..6529478b7f48 100644
--- a/fs/smb/client/cifsacl.h
+++ b/fs/smb/client/cifsacl.h
@@ -9,8 +9,7 @@
#ifndef _CIFSACL_H
#define _CIFSACL_H
-#define NUM_AUTHS (6) /* number of authority fields */
-#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
+#include "../common/smbacl.h"
#define READ_BIT 0x4
#define WRITE_BIT 0x2
@@ -23,101 +22,13 @@
#define UBITSHIFT 6
#define GBITSHIFT 3
-#define ACCESS_ALLOWED 0
-#define ACCESS_DENIED 1
-
-#define SIDOWNER 1
-#define SIDGROUP 2
-
/*
* Security Descriptor length containing DACL with 3 ACEs (one each for
* owner, group and world).
*/
-#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
- sizeof(struct cifs_acl) + \
- (sizeof(struct cifs_ace) * 4))
-
-/*
- * Maximum size of a string representation of a SID:
- *
- * The fields are unsigned values in decimal. So:
- *
- * u8: max 3 bytes in decimal
- * u32: max 10 bytes in decimal
- *
- * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
- *
- * For authority field, max is when all 6 values are non-zero and it must be
- * represented in hex. So "-0x" + 12 hex digits.
- *
- * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
- */
-#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
-#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
-
-struct cifs_ntsd {
- __le16 revision; /* revision level */
- __le16 type;
- __le32 osidoffset;
- __le32 gsidoffset;
- __le32 sacloffset;
- __le32 dacloffset;
-} __attribute__((packed));
-
-struct cifs_sid {
- __u8 revision; /* revision level */
- __u8 num_subauth;
- __u8 authority[NUM_AUTHS];
- __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
-} __attribute__((packed));
-
-/* size of a struct cifs_sid, sans sub_auth array */
-#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
-
-struct cifs_acl {
- __le16 revision; /* revision level */
- __le16 size;
- __le32 num_aces;
-} __attribute__((packed));
-
-/* ACE types - see MS-DTYP 2.4.4.1 */
-#define ACCESS_ALLOWED_ACE_TYPE 0x00
-#define ACCESS_DENIED_ACE_TYPE 0x01
-#define SYSTEM_AUDIT_ACE_TYPE 0x02
-#define SYSTEM_ALARM_ACE_TYPE 0x03
-#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
-#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
-#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
-#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
-#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
-#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
-#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
-#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
-#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
-#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
-#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */
-#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
-#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */
-#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11
-#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12
-#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13
-
-/* ACE flags */
-#define OBJECT_INHERIT_ACE 0x01
-#define CONTAINER_INHERIT_ACE 0x02
-#define NO_PROPAGATE_INHERIT_ACE 0x04
-#define INHERIT_ONLY_ACE 0x08
-#define INHERITED_ACE 0x10
-#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40
-#define FAILED_ACCESS_ACE_FLAG 0x80
-
-struct cifs_ace {
- __u8 type; /* see above and MS-DTYP 2.4.4.1 */
- __u8 flags;
- __le16 size;
- __le32 access_req;
- struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
-} __attribute__((packed));
+#define DEFAULT_SEC_DESC_LEN (sizeof(struct smb_ntsd) + \
+ sizeof(struct smb_acl) + \
+ (sizeof(struct smb_ace) * 4))
/*
* The current SMB3 form of security descriptor is similar to what was used for
@@ -194,6 +105,6 @@ struct owner_group_sids {
* Minimum security descriptor can be one without any SACL and DACL and can
* consist of revision, type, and two sids of minimum size for owner and group
*/
-#define MIN_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + (2 * MIN_SID_LEN))
+#define MIN_SEC_DESC_LEN (sizeof(struct smb_ntsd) + (2 * MIN_SID_LEN))
#endif /* _CIFSACL_H */
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 6322f0f68a17..7481b21a0489 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -21,127 +21,21 @@
#include <linux/random.h>
#include <linux/highmem.h>
#include <linux/fips.h>
+#include <linux/iov_iter.h>
#include "../common/arc4.h"
#include <crypto/aead.h>
-/*
- * Hash data from a BVEC-type iterator.
- */
-static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize,
- struct shash_desc *shash)
+static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len,
+ void *priv, void *priv2)
{
- const struct bio_vec *bv = iter->bvec;
- unsigned long start = iter->iov_offset;
- unsigned int i;
- void *p;
- int ret;
-
- for (i = 0; i < iter->nr_segs; i++) {
- size_t off, len;
-
- len = bv[i].bv_len;
- if (start >= len) {
- start -= len;
- continue;
- }
-
- len = min_t(size_t, maxsize, len - start);
- off = bv[i].bv_offset + start;
+ struct shash_desc *shash = priv;
+ int ret, *pret = priv2;
- p = kmap_local_page(bv[i].bv_page);
- ret = crypto_shash_update(shash, p + off, len);
- kunmap_local(p);
- if (ret < 0)
- return ret;
-
- maxsize -= len;
- if (maxsize <= 0)
- break;
- start = 0;
+ ret = crypto_shash_update(shash, iter_base, len);
+ if (ret < 0) {
+ *pret = ret;
+ return len;
}
-
- return 0;
-}
-
-/*
- * Hash data from a KVEC-type iterator.
- */
-static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize,
- struct shash_desc *shash)
-{
- const struct kvec *kv = iter->kvec;
- unsigned long start = iter->iov_offset;
- unsigned int i;
- int ret;
-
- for (i = 0; i < iter->nr_segs; i++) {
- size_t len;
-
- len = kv[i].iov_len;
- if (start >= len) {
- start -= len;
- continue;
- }
-
- len = min_t(size_t, maxsize, len - start);
- ret = crypto_shash_update(shash, kv[i].iov_base + start, len);
- if (ret < 0)
- return ret;
- maxsize -= len;
-
- if (maxsize <= 0)
- break;
- start = 0;
- }
-
- return 0;
-}
-
-/*
- * Hash data from an XARRAY-type iterator.
- */
-static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize,
- struct shash_desc *shash)
-{
- struct folio *folios[16], *folio;
- unsigned int nr, i, j, npages;
- loff_t start = iter->xarray_start + iter->iov_offset;
- pgoff_t last, index = start / PAGE_SIZE;
- ssize_t ret = 0;
- size_t len, offset, foffset;
- void *p;
-
- if (maxsize == 0)
- return 0;
-
- last = (start + maxsize - 1) / PAGE_SIZE;
- do {
- nr = xa_extract(iter->xarray, (void **)folios, index, last,
- ARRAY_SIZE(folios), XA_PRESENT);
- if (nr == 0)
- return -EIO;
-
- for (i = 0; i < nr; i++) {
- folio = folios[i];
- npages = folio_nr_pages(folio);
- foffset = start - folio_pos(folio);
- offset = foffset % PAGE_SIZE;
- for (j = foffset / PAGE_SIZE; j < npages; j++) {
- len = min_t(size_t, maxsize, PAGE_SIZE - offset);
- p = kmap_local_page(folio_page(folio, j));
- ret = crypto_shash_update(shash, p, len);
- kunmap_local(p);
- if (ret < 0)
- return ret;
- maxsize -= len;
- if (maxsize <= 0)
- return 0;
- start += len;
- offset = 0;
- index++;
- }
- }
- } while (nr == ARRAY_SIZE(folios));
return 0;
}
@@ -151,21 +45,13 @@ static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize,
static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize,
struct shash_desc *shash)
{
- if (maxsize == 0)
- return 0;
+ struct iov_iter tmp_iter = *iter;
+ int err = -EIO;
- switch (iov_iter_type(iter)) {
- case ITER_BVEC:
- return cifs_shash_bvec(iter, maxsize, shash);
- case ITER_KVEC:
- return cifs_shash_kvec(iter, maxsize, shash);
- case ITER_XARRAY:
- return cifs_shash_xarray(iter, maxsize, shash);
- default:
- pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter));
- WARN_ON_ONCE(1);
- return -EIO;
- }
+ if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err,
+ cifs_shash_step) != maxsize)
+ return err;
+ return 0;
}
int __cifs_calc_signature(struct smb_rqst *rqst,
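The rework above collapses the three per-iterator-type hashing loops into one step callback driven by iterate_and_advance_kernel(): the helper maps each kernel-addressable segment and hands it to the callback, the callback returns how many bytes of that segment it left unprocessed (0 when it consumed everything), and the helper returns the total number of bytes advanced, which is why cifs_shash_iter() treats anything short of maxsize as the error stashed through priv2. A minimal sketch of the same pattern for a hypothetical CRC helper (illustrative only, not part of this patch; needs <linux/crc32.h> in addition to <linux/iov_iter.h>):

static size_t crc_step(void *iter_base, size_t progress, size_t len,
		       void *priv, void *priv2)
{
	u32 *crc = priv;

	/* Consume the whole mapped segment; this step cannot fail, so priv2 is unused. */
	*crc = crc32_le(*crc, iter_base, len);
	return 0;
}

static int crc_of_iter(const struct iov_iter *iter, size_t size, u32 *crc)
{
	struct iov_iter tmp = *iter;	/* keep the caller's iterator position intact */
	int err = -EIO;

	if (iterate_and_advance_kernel(&tmp, size, crc, &err, crc_step) != size)
		return err;
	return 0;
}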
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 2c4b357d85e2..2a2523c93944 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -75,9 +75,9 @@ unsigned int sign_CIFS_PDUs = 1;
/*
* Global transaction id (XID) information
*/
-unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
-unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
-unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
+unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */
+unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */
+unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */
spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
/*
@@ -1341,7 +1341,6 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
struct cifs_tcon *target_tcon;
- unsigned long long destend, fstart, fend;
ssize_t rc;
cifs_dbg(FYI, "copychunk range\n");
@@ -1391,25 +1390,13 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
goto unlock;
}
- destend = destoff + len - 1;
-
- /* Flush the folios at either end of the destination range to prevent
- * accidental loss of dirty data outside of the range.
+ /* Flush and invalidate all the folios in the destination region. If
+ * the copy was successful, then some of the flush is extra overhead,
+ * but we need to allow for the copy failing in some way (e.g. ENOSPC).
*/
- fstart = destoff;
- fend = destend;
-
- rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true);
+ rc = filemap_invalidate_inode(target_inode, true, destoff, destoff + len - 1);
if (rc)
goto unlock;
- rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false);
- if (rc)
- goto unlock;
- if (fend > target_cifsi->netfs.zero_point)
- target_cifsi->netfs.zero_point = fend + 1;
-
- /* Discard all the folios that overlap the destination region. */
- truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
i_size_read(target_inode), 0);
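The single filemap_invalidate_inode() call above replaces the old sequence of flushing the folios at both edges of the destination and then truncating the pages in between. With the flush argument set to true it is roughly equivalent to the following two steps over the same byte range (a sketch of the intent only, not the mm implementation):

	/* Write back anything dirty in the destination range... */
	filemap_write_and_wait_range(target_inode->i_mapping,
				     destoff, destoff + len - 1);
	/* ...then drop the now-clean folios so post-copy data is re-read from the server. */
	invalidate_inode_pages2_range(target_inode->i_mapping,
				      destoff >> PAGE_SHIFT,
				      (destoff + len - 1) >> PAGE_SHIFT);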
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index ca2bd204bcc5..61ded59b858f 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -106,7 +106,6 @@ extern int cifs_flush(struct file *, fl_owner_t id);
extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma);
extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct file_operations cifs_dir_ops;
-extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
/* Functions related to dir entries */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index f6d1f075987f..a71a988a92f9 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -202,10 +202,10 @@ struct cifs_cred {
int gid;
int mode;
int cecount;
- struct cifs_sid osid;
- struct cifs_sid gsid;
+ struct smb_sid osid;
+ struct smb_sid gsid;
struct cifs_ntace *ntaces;
- struct cifs_ace *aces;
+ struct smb_ace *aces;
};
struct cifs_open_info_data {
@@ -231,8 +231,8 @@ struct cifs_open_info_data {
unsigned int eas_len;
} wsl;
char *symlink_target;
- struct cifs_sid posix_owner;
- struct cifs_sid posix_group;
+ struct smb_sid posix_owner;
+ struct smb_sid posix_group;
union {
struct smb2_file_all_info fi;
struct smb311_posix_qinfo posix_fi;
@@ -254,9 +254,8 @@ struct cifs_open_info_data {
struct smb_rqst {
struct kvec *rq_iov; /* array of kvecs */
unsigned int rq_nvec; /* number of kvecs in array */
- size_t rq_iter_size; /* Amount of data in ->rq_iter */
struct iov_iter rq_iter; /* Data iterator */
- struct xarray rq_buffer; /* Page buffer for encryption */
+ struct folio_queue *rq_buffer; /* Buffer for encryption */
};
struct mid_q_entry;
@@ -345,7 +344,7 @@ struct smb_version_operations {
/* connect to a server share */
int (*tree_connect)(const unsigned int, struct cifs_ses *, const char *,
struct cifs_tcon *, const struct nls_table *);
- /* close tree connecion */
+ /* close tree connection */
int (*tree_disconnect)(const unsigned int, struct cifs_tcon *);
/* get DFS referrals */
int (*get_dfs_refer)(const unsigned int, struct cifs_ses *,
@@ -537,12 +536,12 @@ struct smb_version_operations {
int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
const char *, const void *, const __u16,
const struct nls_table *, struct cifs_sb_info *);
- struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
- const char *, u32 *, u32);
- struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *,
- const struct cifs_fid *, u32 *, u32);
- int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
- int);
+ struct smb_ntsd * (*get_acl)(struct cifs_sb_info *cifssb, struct inode *ino,
+ const char *path, u32 *plen, u32 info);
+ struct smb_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *cifssmb,
+ const struct cifs_fid *pfid, u32 *plen, u32 info);
+ int (*set_acl)(struct smb_ntsd *pntsd, __u32 len, struct inode *ino, const char *path,
+ int flag);
/* writepages retry size */
unsigned int (*wp_retry_size)(struct inode *);
/* get mtu credits */
@@ -556,7 +555,7 @@ struct smb_version_operations {
bool (*dir_needs_close)(struct cifsFileInfo *);
long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
loff_t);
- /* init transform request - used for encryption for now */
+ /* init transform (compress/encrypt) request */
int (*init_transform_rq)(struct TCP_Server_Info *, int num_rqst,
struct smb_rqst *, struct smb_rqst *);
int (*is_transform_hdr)(void *buf);
@@ -816,7 +815,7 @@ struct TCP_Server_Info {
* Protected by @refpath_lock and @srv_lock. The @refpath_lock is
* mostly used for not requiring a copy of @leaf_fullpath when getting
* cached or new DFS referrals (which might also sleep during I/O).
- * While @srv_lock is held for making string and NULL comparions against
+ * While @srv_lock is held for making string and NULL comparisons against
* both fields as in mount(2) and cache refresh.
*
* format: \\HOST\SHARE[\OPTIONAL PATH]
@@ -1550,7 +1549,6 @@ struct cifsInodeInfo {
#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */
#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
-#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */
#define CIFS_INO_CLOSE_ON_LOCK (7) /* Not to defer the close when lock is set */
unsigned long flags;
spinlock_t writers_lock;
@@ -1876,12 +1874,13 @@ static inline bool is_replayable_error(int error)
#define CIFS_HAS_CREDITS 0x0400 /* already has credits */
#define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */
#define CIFS_NO_SRV_RSP 0x1000 /* there is no server response */
+#define CIFS_COMPRESS_REQ 0x4000 /* compress request before sending */
/* Security Flags: indicate type of session setup needed */
#define CIFSSEC_MAY_SIGN 0x00001
#define CIFSSEC_MAY_NTLMV2 0x00004
#define CIFSSEC_MAY_KRB5 0x00008
-#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */
+#define CIFSSEC_MAY_SEAL 0x00040
#define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */
#define CIFSSEC_MUST_SIGN 0x01001
@@ -1891,11 +1890,11 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_NTLMV2 0x04004
#define CIFSSEC_MUST_KRB5 0x08008
#ifdef CONFIG_CIFS_UPCALL
-#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
+#define CIFSSEC_MASK 0xCF0CF /* flags supported if no weak allowed */
#else
-#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */
+#define CIFSSEC_MASK 0xC70C7 /* flags supported if no weak allowed */
#endif /* UPCALL */
-#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
+#define CIFSSEC_MUST_SEAL 0x40040
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL)
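The CIFSSEC_MASK change simply ORs the two sealing bits, now actually supported, into the old masks. A quick check of the arithmetic, written as illustrative compile-time assertions rather than anything in this patch:

	/* 0x8F08F | CIFSSEC_MAY_SEAL (0x00040) | CIFSSEC_MUST_SEAL (0x40040) == 0xCF0CF */
	static_assert((0x8F08F | CIFSSEC_MAY_SEAL | CIFSSEC_MUST_SEAL) == 0xCF0CF);
	/* ...and without CONFIG_CIFS_UPCALL: 0x87087 | 0x00040 | 0x40040 == 0xC70C7 */
	static_assert((0x87087 | CIFSSEC_MAY_SEAL | CIFSSEC_MUST_SEAL) == 0xC70C7);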
@@ -2017,9 +2016,9 @@ extern spinlock_t cifs_tcp_ses_lock;
/*
* Global transaction id (XID) information
*/
-extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */
-extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
-extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */
+extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */
+extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */
+extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */
extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
/*
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index a2072ab9e586..c3b6263060b0 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -2573,12 +2573,6 @@ typedef struct {
} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */
-struct win_dev {
- unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO or LnxSOCK */
- __le64 major;
- __le64 minor;
-} __attribute__((packed));
-
struct fea {
unsigned char EA_flags;
__u8 name_len;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 497bf3c447bc..c69e3f48a60c 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -225,7 +225,7 @@ extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs,
extern int cifs_rename_pending_delete(const char *full_path,
struct dentry *dentry,
const unsigned int xid);
-extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
+extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid,
struct cifs_fattr *fattr, uint sidtype);
extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
@@ -233,19 +233,19 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
const char *path, const struct cifs_fid *pfid);
extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
kuid_t uid, kgid_t gid);
-extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
- const char *, u32 *, u32);
-extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
- const struct cifs_fid *, u32 *, u32);
+extern struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifssmb, struct inode *ino,
+ const char *path, u32 *plen, u32 info);
+extern struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifssb,
+ const struct cifs_fid *pfid, u32 *plen, u32 info);
extern struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap,
struct dentry *dentry, int type);
extern int cifs_set_acl(struct mnt_idmap *idmap,
struct dentry *dentry, struct posix_acl *acl, int type);
-extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
- const char *, int);
-extern unsigned int setup_authusers_ACE(struct cifs_ace *pace);
-extern unsigned int setup_special_mode_ACE(struct cifs_ace *pace, __u64 nmode);
-extern unsigned int setup_special_user_owner_ACE(struct cifs_ace *pace);
+extern int set_cifs_acl(struct smb_ntsd *pntsd, __u32 len, struct inode *ino,
+ const char *path, int flag);
+extern unsigned int setup_authusers_ACE(struct smb_ace *pace);
+extern unsigned int setup_special_mode_ACE(struct smb_ace *pace, __u64 nmode);
+extern unsigned int setup_special_user_owner_ACE(struct smb_ace *pace);
extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
@@ -570,9 +570,9 @@ extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage,
struct cifs_sb_info *cifs_sb);
extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
- __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
+ __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen);
extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
- struct cifs_ntsd *, __u32, int);
+ struct smb_ntsd *pntsd, __u32 len, int aclflag);
extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName,
struct posix_acl **acl, const int acl_type,
@@ -676,6 +676,10 @@ char *extract_sharename(const char *unc);
int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
bool unicode, struct cifs_open_info_data *data);
+int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname);
int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 595c4b673707..131f20b91c3e 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1076,8 +1076,8 @@ OldOpenRetry:
pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO);
pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags));
pSMB->Mode |= cpu_to_le16(0x40); /* deny none */
- /* set file as system file if special file such
- as fifo and server expecting SFU style and
+ /* set file as system file if special file such as fifo,
+ * socket, char or block and server expecting SFU style and
no Unix extensions */
if (create_options & CREATE_OPTION_SPECIAL)
@@ -1193,8 +1193,8 @@ openRetry:
req->AllocationSize = 0;
/*
- * Set file as system file if special file such as fifo and server
- * expecting SFU style and no Unix extensions.
+ * Set file as system file if special file such as fifo, socket, char
+ * or block and server expecting SFU style and no Unix extensions.
*/
if (create_options & CREATE_OPTION_SPECIAL)
req->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
@@ -1261,16 +1261,30 @@ openRetry:
return rc;
}
+static void cifs_readv_worker(struct work_struct *work)
+{
+ struct cifs_io_subrequest *rdata =
+ container_of(work, struct cifs_io_subrequest, subreq.work);
+
+ netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false);
+}
+
static void
cifs_readv_callback(struct mid_q_entry *mid)
{
struct cifs_io_subrequest *rdata = mid->callback_data;
+ struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode);
struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
.rq_iter = rdata->subreq.io_iter };
- struct cifs_credits credits = { .value = 1, .instance = 0 };
+ struct cifs_credits credits = {
+ .value = 1,
+ .instance = 0,
+ .rreq_debug_id = rdata->rreq->debug_id,
+ .rreq_debug_index = rdata->subreq.debug_index,
+ };
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
@@ -1282,6 +1296,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
if (server->sign) {
int rc = 0;
+ iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes);
rc = cifs_verify_signature(&rqst, server,
mid->sequence_number);
if (rc)
@@ -1306,13 +1321,22 @@ cifs_readv_callback(struct mid_q_entry *mid)
rdata->result = -EIO;
}
- if (rdata->result == 0 || rdata->result == -EAGAIN)
- iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
+ if (rdata->result == -ENODATA) {
+ __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
+ rdata->result = 0;
+ } else {
+ size_t trans = rdata->subreq.transferred + rdata->got_bytes;
+ if (trans < rdata->subreq.len &&
+ rdata->subreq.start + trans == ictx->remote_i_size) {
+ __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
+ rdata->result = 0;
+ }
+ }
+
rdata->credits.value = 0;
- netfs_subreq_terminated(&rdata->subreq,
- (rdata->result == 0 || rdata->result == -EAGAIN) ?
- rdata->got_bytes : rdata->result,
- false);
+ rdata->subreq.transferred += rdata->got_bytes;
+ INIT_WORK(&rdata->subreq.work, cifs_readv_worker);
+ queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid);
add_credits(server, &credits, 0);
}
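To make the new EOF handling concrete: for a subrequest with start 0 and len 1 MiB against a 4 KiB file, a read entirely beyond EOF now reports -ENODATA and sets NETFS_SREQ_HIT_EOF directly, while a 4096-byte short read satisfies start + transferred + got_bytes == remote_i_size, so the flag is set and the result cleared to 0 as well. Either way netfslib treats the short read as end-of-file rather than an error, and the completion itself is punted to cifs_readv_worker() on cifsiod_wq rather than being run directly from the mid callback.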
@@ -1619,9 +1643,15 @@ static void
cifs_writev_callback(struct mid_q_entry *mid)
{
struct cifs_io_subrequest *wdata = mid->callback_data;
+ struct TCP_Server_Info *server = wdata->server;
struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
- struct cifs_credits credits = { .value = 1, .instance = 0 };
+ struct cifs_credits credits = {
+ .value = 1,
+ .instance = 0,
+ .rreq_debug_id = wdata->rreq->debug_id,
+ .rreq_debug_index = wdata->subreq.debug_index,
+ };
ssize_t result;
size_t written;
@@ -1657,9 +1687,16 @@ cifs_writev_callback(struct mid_q_entry *mid)
break;
}
+ trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index,
+ wdata->credits.value,
+ server->credits, server->in_flight,
+ 0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
cifs_write_subrequest_terminated(wdata, result, true);
release_mid(mid);
+ trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0,
+ server->credits, server->in_flight,
+ credits.value, cifs_trace_rw_credits_write_response_add);
add_credits(tcon->ses->server, &credits, 0);
}
@@ -1713,7 +1750,6 @@ cifs_async_writev(struct cifs_io_subrequest *wdata)
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
rqst.rq_iter = wdata->subreq.io_iter;
- rqst.rq_iter_size = iov_iter_count(&wdata->subreq.io_iter);
cifs_dbg(FYI, "async write at %llu %zu bytes\n",
wdata->subreq.start, wdata->subreq.len);
@@ -3391,7 +3427,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
/* Get Security Descriptor (by handle) from remote server for a file or dir */
int
CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
- struct cifs_ntsd **acl_inf, __u32 *pbuflen)
+ struct smb_ntsd **acl_inf, __u32 *pbuflen)
{
int rc = 0;
int buf_type = 0;
@@ -3461,7 +3497,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
/* check if buffer is big enough for the acl
header followed by the smallest SID */
- if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
+ if ((*pbuflen < sizeof(struct smb_ntsd) + 8) ||
(*pbuflen >= 64 * 1024)) {
cifs_dbg(VFS, "bad acl length %d\n", *pbuflen);
rc = -EINVAL;
@@ -3481,7 +3517,7 @@ qsec_out:
int
CIFSSMBSetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
- struct cifs_ntsd *pntsd, __u32 acllen, int aclflag)
+ struct smb_ntsd *pntsd, __u32 acllen, int aclflag)
{
__u16 byte_count, param_count, data_count, param_offset, data_offset;
int rc = 0;
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
new file mode 100644
index 000000000000..63b5a55b7a57
--- /dev/null
+++ b/fs/smb/client/compress.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024, SUSE LLC
+ *
+ * Authors: Enzo Matsumiya <ematsumiya@suse.de>
+ *
+ * This file implements I/O compression support for SMB2 messages (SMB 3.1.1 only).
+ * See compress/ for implementation details of each algorithm.
+ *
+ * References:
+ * MS-SMB2 "3.1.4.4 Compressing the Message"
+ * MS-SMB2 "3.1.5.3 Decompressing the Chained Message"
+ * MS-XCA - for details of the supported algorithms
+ */
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/uio.h>
+#include <linux/sort.h>
+
+#include "cifsglob.h"
+#include "../common/smb2pdu.h"
+#include "cifsproto.h"
+#include "smb2proto.h"
+
+#include "compress/lz77.h"
+#include "compress.h"
+
+/*
+ * The heuristic functions below try to determine data compressibility.
+ *
+ * Derived from fs/btrfs/compression.c, changing coding style, some parameters, and removing
+ * unused parts.
+ *
+ * Read that file for a better and more detailed explanation of the calculations.
+ *
+ * The algorithms are run on a collected sample of the input (uncompressed) data.
+ * The sample is formed of 2K reads in PAGE_SIZE intervals, with a maximum size of 4M.
+ *
+ * Parsing the sample goes from "low-hanging fruit" (fastest algorithms, likely compressible)
+ * to "needs more analysis" (likely incompressible).
+ */
+
+struct bucket {
+ unsigned int count;
+};
+
+/**
+ * has_low_entropy() - Compute Shannon entropy of the sampled data.
+ * @bkt: Byte counts of the sample.
+ * @slen: Size of the sample.
+ *
+ * Return: true if the entropy level (the percentage of the maximum number of bits that would be
+ * required to represent the data) is at or below the minimum threshold.
+ *
+ * Note:
+ * There _is_ an entropy level here that's > 65 (the minimum threshold) which would still indicate
+ * a possibility of compression, but compressing (or even further analysing) such data would waste
+ * so many resources that it's simply not worth it.
+ *
+ * Also, Shannon entropy is the last heuristic computed; if we got this far and still ended up
+ * with uncertainty, just stay on the safe side and call the data incompressible.
+ */
+static bool has_low_entropy(struct bucket *bkt, size_t slen)
+{
+ const size_t threshold = 65, max_entropy = 8 * ilog2(16);
+ size_t i, p, p2, len, sum = 0;
+
+#define pow4(n) (n * n * n * n)
+ len = ilog2(pow4(slen));
+
+ for (i = 0; i < 256 && bkt[i].count > 0; i++) {
+ p = bkt[i].count;
+ p2 = ilog2(pow4(p));
+ sum += p * (len - p2);
+ }
+
+ sum /= slen;
+
+ return ((sum * 100 / max_entropy) <= threshold);
+}
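In other words, the integer arithmetic above approximates four times the Shannon entropy of the sample, in bits per byte: ilog2(pow4(x)) is roughly 4 * ilog2(x), so a byte value with count p contributes about 4 * p * log2(slen/p), and after the division by slen the sum is approximately 4 * H. With max_entropy = 8 * ilog2(16) = 32 (the same factor of four applied to the 8 bits/byte maximum), the test works out to:

	sum / slen  ~=  4 * SUM_b (count_b / slen) * log2(slen / count_b)  =  4 * H
	pass  <=>  (sum * 100) / 32 <= 65  <=>  H <= ~5.2 bits per byte

so the sample is considered compressible when its estimated entropy is at most about 65% of the maximum.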
+
+#define BYTE_DIST_BAD 0
+#define BYTE_DIST_GOOD 1
+#define BYTE_DIST_MAYBE 2
+/**
+ * calc_byte_distribution() - Compute byte distribution on the sampled data.
+ * @bkt: Byte counts of the sample.
+ * @slen: Size of the sample.
+ *
+ * Return:
+ * BYTE_DIST_BAD: A "hard no" for compression -- a computed uniform distribution of
+ * the bytes (e.g. random or encrypted data).
+ * BYTE_DIST_GOOD: High probability (normal (Gaussian) distribution) of the data being
+ * compressible.
+ * BYTE_DIST_MAYBE: The computed byte distribution fell in the "low < n < high" middle
+ * ground; has_low_entropy() should be used for a final decision.
+ */
+static int calc_byte_distribution(struct bucket *bkt, size_t slen)
+{
+ const size_t low = 64, high = 200, threshold = slen * 90 / 100;
+ size_t sum = 0;
+ int i;
+
+ for (i = 0; i < low; i++)
+ sum += bkt[i].count;
+
+ if (sum > threshold)
+ return BYTE_DIST_BAD;
+
+ for (; i < high && bkt[i].count > 0; i++) {
+ sum += bkt[i].count;
+ if (sum > threshold)
+ break;
+ }
+
+ if (i <= low)
+ return BYTE_DIST_GOOD;
+
+ if (i >= high)
+ return BYTE_DIST_BAD;
+
+ return BYTE_DIST_MAYBE;
+}
+
+static bool is_mostly_ascii(const struct bucket *bkt)
+{
+ size_t count = 0;
+ int i;
+
+ for (i = 0; i < 256; i++)
+ if (bkt[i].count > 0)
+ /* More than 64 distinct byte values is too many for mostly-ASCII text. */
+ if (++count > 64)
+ return false;
+
+ return true;
+}
+
+static bool has_repeated_data(const u8 *sample, size_t len)
+{
+ size_t s = len / 2;
+
+ return (!memcmp(&sample[0], &sample[s], s));
+}
+
+static int cmp_bkt(const void *_a, const void *_b)
+{
+ const struct bucket *a = _a, *b = _b;
+
+ /* Reverse sort. */
+ if (a->count > b->count)
+ return -1;
+
+ return 1;
+}
+
+/*
+ * TODO:
+ * Support other iter types, if required.
+ * Only ITER_XARRAY is supported for now.
+ */
+static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
+{
+ struct folio *folios[16], *folio;
+ unsigned int nr, i, j, npages;
+ loff_t start = iter->xarray_start + iter->iov_offset;
+ pgoff_t last, index = start / PAGE_SIZE;
+ size_t len, off, foff;
+ ssize_t ret = 0;
+ void *p;
+ int s = 0;
+
+ last = (start + max - 1) / PAGE_SIZE;
+ do {
+ nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios),
+ XA_PRESENT);
+ if (nr == 0)
+ return -EIO;
+
+ for (i = 0; i < nr; i++) {
+ folio = folios[i];
+ npages = folio_nr_pages(folio);
+ foff = start - folio_pos(folio);
+ off = foff % PAGE_SIZE;
+
+ for (j = foff / PAGE_SIZE; j < npages; j++) {
+ size_t len2;
+
+ len = min_t(size_t, max, PAGE_SIZE - off);
+ len2 = min_t(size_t, len, SZ_2K);
+
+ p = kmap_local_page(folio_page(folio, j));
+ memcpy(&sample[s], p, len2);
+ kunmap_local(p);
+
+ if (ret < 0)
+ return ret;
+
+ s += len2;
+
+ if (len2 < SZ_2K || s >= max - SZ_2K)
+ return s;
+
+ max -= len;
+ if (max <= 0)
+ return s;
+
+ start += len;
+ off = 0;
+ index++;
+ }
+ }
+ } while (nr == ARRAY_SIZE(folios));
+
+ return s;
+}
+
+/**
+ * is_compressible() - Determines if a chunk of data is compressible.
+ * @data: Iterator containing uncompressed data.
+ *
+ * Return: true if @data is compressible, false otherwise.
+ *
+ * Tests show that this function is quite reliable in predicting data compressibility,
+ * matching close to 1:1 with the behaviour of LZ77 compression successes and failures.
+ */
+static bool is_compressible(const struct iov_iter *data)
+{
+ const size_t read_size = SZ_2K, bkt_size = 256, max = SZ_4M;
+ struct bucket *bkt = NULL;
+ size_t len;
+ u8 *sample;
+ bool ret = false;
+ int i;
+
+ /* Preventive double check -- already checked in should_compress(). */
+ len = iov_iter_count(data);
+ if (unlikely(len < read_size))
+ return ret;
+
+ if (len - read_size > max)
+ len = max;
+
+ sample = kvzalloc(len, GFP_KERNEL);
+ if (!sample) {
+ WARN_ON_ONCE(1);
+
+ return ret;
+ }
+
+ /* Sample 2K bytes per page of the uncompressed data. */
+ i = collect_sample(data, len, sample);
+ if (i <= 0) {
+ WARN_ON_ONCE(1);
+
+ goto out;
+ }
+
+ len = i;
+ ret = true;
+
+ if (has_repeated_data(sample, len))
+ goto out;
+
+ bkt = kcalloc(bkt_size, sizeof(*bkt), GFP_KERNEL);
+ if (!bkt) {
+ WARN_ON_ONCE(1);
+ ret = false;
+
+ goto out;
+ }
+
+ for (i = 0; i < len; i++)
+ bkt[sample[i]].count++;
+
+ if (is_mostly_ascii(bkt))
+ goto out;
+
+ /* Sort in descending order */
+ sort(bkt, bkt_size, sizeof(*bkt), cmp_bkt, NULL);
+
+ i = calc_byte_distribution(bkt, len);
+ if (i != BYTE_DIST_MAYBE) {
+ ret = !!i;
+
+ goto out;
+ }
+
+ ret = has_low_entropy(bkt, len);
+out:
+ kvfree(sample);
+ kfree(bkt);
+
+ return ret;
+}
+
+bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
+{
+ const struct smb2_hdr *shdr = rq->rq_iov->iov_base;
+
+ if (unlikely(!tcon || !tcon->ses || !tcon->ses->server))
+ return false;
+
+ if (!tcon->ses->server->compression.enabled)
+ return false;
+
+ if (!(tcon->share_flags & SMB2_SHAREFLAG_COMPRESS_DATA))
+ return false;
+
+ if (shdr->Command == SMB2_WRITE) {
+ const struct smb2_write_req *wreq = rq->rq_iov->iov_base;
+
+ if (le32_to_cpu(wreq->Length) < SMB_COMPRESS_MIN_LEN)
+ return false;
+
+ return is_compressible(&rq->rq_iter);
+ }
+
+ return (shdr->Command == SMB2_READ);
+}
+
+int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn)
+{
+ struct iov_iter iter;
+ u32 slen, dlen;
+ void *src, *dst = NULL;
+ int ret;
+
+ if (!server || !rq || !rq->rq_iov || !rq->rq_iov->iov_base)
+ return -EINVAL;
+
+ if (rq->rq_iov->iov_len != sizeof(struct smb2_write_req))
+ return -EINVAL;
+
+ slen = iov_iter_count(&rq->rq_iter);
+ src = kvzalloc(slen, GFP_KERNEL);
+ if (!src) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ /* Keep the original iter intact. */
+ iter = rq->rq_iter;
+
+ if (!copy_from_iter_full(src, slen, &iter)) {
+ ret = -EIO;
+ goto err_free;
+ }
+
+ /*
+ * This is just overprovisioning, as the algorithm will error out if @dst reaches 7/8
+ * of @slen.
+ */
+ dlen = slen;
+ dst = kvzalloc(dlen, GFP_KERNEL);
+ if (!dst) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ ret = lz77_compress(src, slen, dst, &dlen);
+ if (!ret) {
+ struct smb2_compression_hdr hdr = { 0 };
+ struct smb_rqst comp_rq = { .rq_nvec = 3, };
+ struct kvec iov[3];
+
+ hdr.ProtocolId = SMB2_COMPRESSION_TRANSFORM_ID;
+ hdr.OriginalCompressedSegmentSize = cpu_to_le32(slen);
+ hdr.CompressionAlgorithm = SMB3_COMPRESS_LZ77;
+ hdr.Flags = SMB2_COMPRESSION_FLAG_NONE;
+ hdr.Offset = cpu_to_le32(rq->rq_iov[0].iov_len);
+
+ iov[0].iov_base = &hdr;
+ iov[0].iov_len = sizeof(hdr);
+ iov[1] = rq->rq_iov[0];
+ iov[2].iov_base = dst;
+ iov[2].iov_len = dlen;
+
+ comp_rq.rq_iov = iov;
+
+ ret = send_fn(server, 1, &comp_rq);
+ } else if (ret == -EMSGSIZE || dlen >= slen) {
+ ret = send_fn(server, 1, rq);
+ }
+err_free:
+ kvfree(dst);
+ kvfree(src);
+
+ return ret;
+}
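For reference, the kvec array assembled above lays a compressed WRITE out on the wire as follows (sizes taken from the code; a sketch, not a reproduction of the MS-SMB2 figures):

	iov[0]	smb2_compression_hdr		transform magic, OriginalCompressedSegmentSize = slen,
						CompressionAlgorithm = LZ77, Offset = rq->rq_iov[0].iov_len
	iov[1]	original SMB2 WRITE request	the uncompressed PDU (rq->rq_iov[0])
	iov[2]	LZ77 output			dlen bytes of compressed write payload

If lz77_compress() gives up with -EMSGSIZE (the output would exceed 7/8 of the input) or the result is not actually smaller, the original uncompressed request is sent instead.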
diff --git a/fs/smb/client/compress.h b/fs/smb/client/compress.h
new file mode 100644
index 000000000000..f3ed1d3e52fb
--- /dev/null
+++ b/fs/smb/client/compress.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2024, SUSE LLC
+ *
+ * Authors: Enzo Matsumiya <ematsumiya@suse.de>
+ *
+ * This file implements I/O compression support for SMB2 messages (SMB 3.1.1 only).
+ * See compress/ for implementation details of each algorithm.
+ *
+ * References:
+ * MS-SMB2 "3.1.4.4 Compressing the Message" - for compression details
+ * MS-SMB2 "3.1.5.3 Decompressing the Chained Message" - for decompression details
+ * MS-XCA - for details of the supported algorithms
+ */
+#ifndef _SMB_COMPRESS_H
+#define _SMB_COMPRESS_H
+
+#include <linux/uio.h>
+#include <linux/kernel.h>
+#include "../common/smb2pdu.h"
+#include "cifsglob.h"
+
+/* sizeof(smb2_compression_hdr) - sizeof(OriginalPayloadSize) */
+#define SMB_COMPRESS_HDR_LEN 16
+/* sizeof(smb2_compression_payload_hdr) - sizeof(OriginalPayloadSize) */
+#define SMB_COMPRESS_PAYLOAD_HDR_LEN 8
+#define SMB_COMPRESS_MIN_LEN PAGE_SIZE
+
+#ifdef CONFIG_CIFS_COMPRESSION
+typedef int (*compress_send_fn)(struct TCP_Server_Info *, int, struct smb_rqst *);
+
+int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn);
+
+/**
+ * should_compress() - Determines if a request (write) or the response to a
+ * request (read) should be compressed.
+ * @tcon: tcon the request is being sent to
+ * @rq: request to evaluate
+ *
+ * Return: true iff:
+ * - compression was successfully negotiated with server
+ * - server has enabled compression for the share
+ * - it's a read or write request
+ * - (write only) request length is >= SMB_COMPRESS_MIN_LEN
+ * - (write only) is_compressible() returns 1
+ *
+ * Return false otherwise.
+ */
+bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq);
+
+/**
+ * smb_compress_alg_valid() - Validate a compression algorithm.
+ * @alg: Compression algorithm to check.
+ * @valid_none: Conditional check whether NONE algorithm should be
+ * considered valid or not.
+ *
+ * If @alg is SMB3_COMPRESS_NONE, this function returns @valid_none.
+ *
+ * Note that 'NONE' (0) compressor type is considered invalid in protocol
+ * negotiation, as it's never requested to, or returned by, the server.
+ *
+ * Return: true if @alg is valid/supported, false otherwise.
+ */
+static __always_inline int smb_compress_alg_valid(__le16 alg, bool valid_none)
+{
+ if (alg == SMB3_COMPRESS_NONE)
+ return valid_none;
+
+ if (alg == SMB3_COMPRESS_LZ77 || alg == SMB3_COMPRESS_PATTERN)
+ return true;
+
+ return false;
+}
+#else /* !CONFIG_CIFS_COMPRESSION */
+static inline int smb_compress(void *unused1, void *unused2, void *unused3)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool should_compress(void *unused1, void *unused2)
+{
+ return false;
+}
+
+static inline int smb_compress_alg_valid(__le16 unused1, bool unused2)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* !CONFIG_CIFS_COMPRESSION */
+#endif /* _SMB_COMPRESS_H */
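Putting the two entry points together, a transport layer would be expected to use this header roughly as below (a hypothetical call site; my_send_fn is an assumed name for whatever compress_send_fn the SMB2 send path supplies, which is not part of this header):

	if (should_compress(tcon, rqst))
		rc = smb_compress(server, rqst, my_send_fn);
	else
		rc = my_send_fn(server, 1, rqst);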
diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c
new file mode 100644
index 000000000000..553e253ada29
--- /dev/null
+++ b/fs/smb/client/compress/lz77.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024, SUSE LLC
+ *
+ * Authors: Enzo Matsumiya <ematsumiya@suse.de>
+ *
+ * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec.
+ */
+#include <linux/slab.h>
+#include <linux/sizes.h>
+#include <linux/count_zeros.h>
+#include <asm/unaligned.h>
+
+#include "lz77.h"
+
+/*
+ * Compression parameters.
+ */
+#define LZ77_MATCH_MIN_LEN 4
+#define LZ77_MATCH_MIN_DIST 1
+#define LZ77_MATCH_MAX_DIST SZ_1K
+#define LZ77_HASH_LOG 15
+#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG)
+#define LZ77_STEP_SIZE sizeof(u64)
+
+static __always_inline u8 lz77_read8(const u8 *ptr)
+{
+ return get_unaligned(ptr);
+}
+
+static __always_inline u64 lz77_read64(const u64 *ptr)
+{
+ return get_unaligned(ptr);
+}
+
+static __always_inline void lz77_write8(u8 *ptr, u8 v)
+{
+ put_unaligned(v, ptr);
+}
+
+static __always_inline void lz77_write16(u16 *ptr, u16 v)
+{
+ put_unaligned_le16(v, ptr);
+}
+
+static __always_inline void lz77_write32(u32 *ptr, u32 v)
+{
+ put_unaligned_le32(v, ptr);
+}
+
+static __always_inline u32 lz77_match_len(const void *wnd, const void *cur, const void *end)
+{
+ const void *start = cur;
+ u64 diff;
+
+ /* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */
+ do {
+ diff = lz77_read64(cur) ^ lz77_read64(wnd);
+ if (!diff) {
+ cur += LZ77_STEP_SIZE;
+ wnd += LZ77_STEP_SIZE;
+
+ continue;
+ }
+
+ /* This computes the number of common bytes in @diff. */
+ cur += count_trailing_zeros(diff) >> 3;
+
+ return (cur - start);
+ } while (likely(cur + LZ77_STEP_SIZE < end));
+
+ while (cur < end && lz77_read8(cur++) == lz77_read8(wnd++))
+ ;
+
+ return (cur - start);
+}
+
+static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u32 len)
+{
+ len -= 3;
+ dist--;
+ dist <<= 3;
+
+ if (len < 7) {
+ lz77_write16(dst, dist + len);
+
+ return dst + 2;
+ }
+
+ dist |= 7;
+ lz77_write16(dst, dist);
+ dst += 2;
+ len -= 7;
+
+ if (!*nib) {
+ lz77_write8(dst, umin(len, 15));
+ *nib = dst;
+ dst++;
+ } else {
+ u8 *b = *nib;
+
+ lz77_write8(b, *b | umin(len, 15) << 4);
+ *nib = NULL;
+ }
+
+ if (len < 15)
+ return dst;
+
+ len -= 15;
+ if (len < 255) {
+ lz77_write8(dst, len);
+
+ return dst + 1;
+ }
+
+ lz77_write8(dst, 0xff);
+ dst++;
+ len += 7 + 15;
+ if (len <= 0xffff) {
+ lz77_write16(dst, len);
+
+ return dst + 2;
+ }
+
+ lz77_write16(dst, 0);
+ dst += 2;
+ lz77_write32(dst, len);
+
+ return dst + 4;
+}
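A worked trace of the encoder above: a match of length 50 at distance 100 becomes len = 47 and dist = 99 << 3 = 792; len >= 7, so the 16-bit token is (792 | 7) = 0x031F with 40 left over, the shared nibble byte takes min(40, 15) = 15 in its low nibble leaving 25, and 25 < 255 so a single extra length byte 0x19 follows:

	output bytes:	1F 03 0F 19
	decoder:	distance = (0x031F >> 3) + 1 = 100,	length = 3 + 7 + 15 + 25 = 50

The high nibble of that third byte stays reserved through *nib for the next long match, which is why it is written as 0x0F here.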
+
+noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen)
+{
+ const void *srcp, *end;
+ void *dstp, *nib, *flag_pos;
+ u32 flag_count = 0;
+ long flag = 0;
+ u64 *htable;
+
+ srcp = src;
+ end = src + slen;
+ dstp = dst;
+ nib = NULL;
+ flag_pos = dstp;
+ dstp += 4;
+
+ htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL);
+ if (!htable)
+ return -ENOMEM;
+
+ /* Main loop. */
+ do {
+ u32 dist, len = 0;
+ const void *wnd;
+ u64 hash;
+
+ hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
+ wnd = src + htable[hash];
+ htable[hash] = srcp - src;
+ dist = srcp - wnd;
+
+ if (dist && dist < LZ77_MATCH_MAX_DIST)
+ len = lz77_match_len(wnd, srcp, end);
+
+ if (len < LZ77_MATCH_MIN_LEN) {
+ lz77_write8(dstp, lz77_read8(srcp));
+
+ dstp++;
+ srcp++;
+
+ flag <<= 1;
+ flag_count++;
+ if (flag_count == 32) {
+ lz77_write32(flag_pos, flag);
+ flag_count = 0;
+ flag_pos = dstp;
+ dstp += 4;
+ }
+
+ continue;
+ }
+
+ /*
+ * Bail out if @dstp reached >= 7/8 of @slen -- already compressed badly, not worth
+ * going further.
+ */
+ if (unlikely(dstp - dst >= slen - (slen >> 3))) {
+ *dlen = slen;
+ goto out;
+ }
+
+ dstp = lz77_write_match(dstp, &nib, dist, len);
+ srcp += len;
+
+ flag = (flag << 1) | 1;
+ flag_count++;
+ if (flag_count == 32) {
+ lz77_write32(flag_pos, flag);
+ flag_count = 0;
+ flag_pos = dstp;
+ dstp += 4;
+ }
+ } while (likely(srcp + LZ77_STEP_SIZE < end));
+
+ while (srcp < end) {
+ u32 c = umin(end - srcp, 32 - flag_count);
+
+ memcpy(dstp, srcp, c);
+
+ dstp += c;
+ srcp += c;
+
+ flag <<= c;
+ flag_count += c;
+ if (flag_count == 32) {
+ lz77_write32(flag_pos, flag);
+ flag_count = 0;
+ flag_pos = dstp;
+ dstp += 4;
+ }
+ }
+
+ flag <<= (32 - flag_count);
+ flag |= (1 << (32 - flag_count)) - 1;
+ lz77_write32(flag_pos, flag);
+
+ *dlen = dstp - dst;
+out:
+ kvfree(htable);
+
+ if (*dlen < slen)
+ return 0;
+
+ return -EMSGSIZE;
+}
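A note on the flag words interleaved with the output: every literal or match shifts one bit into a 32-bit accumulator (0 for a literal byte, 1 for a match token), and once 32 symbols have been emitted the word is stored at the 4-byte slot reserved at flag_pos and a new slot is reserved, so the first symbol of a group ends up in the most significant bit. For example, a group starting literal, literal, match, literal followed by 28 more literals is stored as:

	flag word = 0b0010_0000_0000_0000_0000_0000_0000_0000 = 0x20000000 (written little-endian)

The final, partially filled word is shifted up and has its unused low bits set to 1 before *dlen is computed; and if at any point the output crosses 7/8 of the input size the function bails out with -EMSGSIZE so the caller can fall back to sending the data uncompressed.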
diff --git a/fs/smb/client/compress/lz77.h b/fs/smb/client/compress/lz77.h
new file mode 100644
index 000000000000..cdcb191b48a2
--- /dev/null
+++ b/fs/smb/client/compress/lz77.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2024, SUSE LLC
+ *
+ * Authors: Enzo Matsumiya <ematsumiya@suse.de>
+ *
+ * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec.
+ */
+#ifndef _SMB_COMPRESS_LZ77_H
+#define _SMB_COMPRESS_LZ77_H
+
+#include <linux/kernel.h>
+
+int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen);
+#endif /* _SMB_COMPRESS_LZ77_H */
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index d2307162a2de..08a41c7aaf72 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -657,6 +657,19 @@ static bool
server_unresponsive(struct TCP_Server_Info *server)
{
/*
+ * If we're in the process of mounting a share or reconnecting a session
+ * and the server abruptly shut down (e.g. socket wasn't closed, packet
+ * had been ACK'ed but no SMB response), don't wait longer than 20s to
+ * negotiate protocol.
+ */
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsInNegotiate &&
+ time_after(jiffies, server->lstrp + 20 * HZ)) {
+ spin_unlock(&server->srv_lock);
+ cifs_reconnect(server, false);
+ return true;
+ }
+ /*
* We need to wait 3 echo intervals to make sure we handle such
* situations right:
* 1s client sends a normal SMB request
@@ -667,7 +680,6 @@ server_unresponsive(struct TCP_Server_Info *server)
* 65s kernel_recvmsg times out, and we see that we haven't gotten
* a response in >60s.
*/
- spin_lock(&server->srv_lock);
if ((server->tcpStatus == CifsGood ||
server->tcpStatus == CifsNeedNegotiate) &&
(!server->ops->can_echo || server->ops->can_echo(server)) &&
@@ -997,11 +1009,10 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
}
if (!list_empty(&server->pending_mid_q)) {
- struct list_head dispose_list;
struct mid_q_entry *mid_entry;
struct list_head *tmp, *tmp2;
+ LIST_HEAD(dispose_list);
- INIT_LIST_HEAD(&dispose_list);
spin_lock(&server->mid_lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
@@ -4069,7 +4080,7 @@ __cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
ses = cifs_get_smb_ses(master_tcon->ses->server, ctx);
if (IS_ERR(ses)) {
- tcon = (struct cifs_tcon *)ses;
+ tcon = ERR_CAST(ses);
cifs_put_tcp_session(master_tcon->ses->server, 0);
goto out;
}
@@ -4194,6 +4205,9 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
*
* If one doesn't exist then insert a new tcon_link struct into the tree and
* try to construct a new one.
+ *
+ * REMEMBER to call cifs_put_tlink() after successful calls to cifs_sb_tlink(),
+ * to avoid refcount issues.
*/
struct tcon_link *
cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index b2405dd4d4d4..78b59c4ef3ce 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -49,6 +49,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
struct cifs_io_subrequest *wdata =
container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = wdata->req;
+ struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr];
struct TCP_Server_Info *server;
struct cifsFileInfo *open_file = req->cfile;
size_t wsize = req->rreq.wsize;
@@ -73,7 +74,7 @@ retry:
}
}
- rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len,
+ rc = server->ops->wait_mtu_credits(server, wsize, &stream->sreq_max_len,
&wdata->credits);
if (rc < 0) {
subreq->error = rc;
@@ -92,7 +93,7 @@ retry:
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->smbd_conn)
- subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+ stream->sreq_max_segs = server->smbd_conn->max_frmr_depth;
#endif
}
@@ -139,25 +140,22 @@ static void cifs_netfs_invalidate_cache(struct netfs_io_request *wreq)
}
/*
- * Split the read up according to how many credits we can get for each piece.
- * It's okay to sleep here if we need to wait for more credit to become
- * available.
- *
- * We also choose the server and allocate an operation ID to be cleaned up
- * later.
+ * Negotiate the size of a read operation on behalf of the netfs library.
*/
-static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
+static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
struct TCP_Server_Info *server = req->server;
struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
- size_t rsize = 0;
- int rc;
+ size_t size;
+ int rc = 0;
- rdata->xid = get_xid();
- rdata->have_xid = true;
+ if (!rdata->have_xid) {
+ rdata->xid = get_xid();
+ rdata->have_xid = true;
+ }
rdata->server = server;
if (cifs_sb->ctx->rsize == 0)
@@ -165,13 +163,12 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink),
cifs_sb->ctx);
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
+ &size, &rdata->credits);
+ if (rc)
+ return rc;
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize,
- &rdata->credits);
- if (rc) {
- subreq->error = rc;
- return false;
- }
+ rreq->io_streams[0].sreq_max_len = size;
rdata->credits.in_flight_check = 1;
rdata->credits.rreq_debug_id = rreq->debug_id;
@@ -183,13 +180,11 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
server->credits, server->in_flight, 0,
cifs_trace_rw_credits_read_submit);
- subreq->len = min_t(size_t, subreq->len, rsize);
-
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->smbd_conn)
- subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+ rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth;
#endif
- return true;
+ return 0;
}
/*
@@ -198,31 +193,41 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
* to only read a portion of that, but as long as we read something, the netfs
* helper will call us again so that we can issue another read.
*/
-static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
+static void cifs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+ struct TCP_Server_Info *server = req->server;
int rc = 0;
cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
__func__, rreq->debug_id, subreq->debug_index, rreq->mapping,
subreq->transferred, subreq->len);
+ rc = adjust_credits(server, rdata, cifs_trace_rw_credits_issue_read_adjust);
+ if (rc)
+ goto failed;
+
if (req->cfile->invalidHandle) {
do {
rc = cifs_reopen_file(req->cfile, true);
} while (rc == -EAGAIN);
if (rc)
- goto out;
+ goto failed;
}
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (subreq->rreq->origin != NETFS_DIO_READ)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
rc = rdata->server->ops->async_readv(rdata);
-out:
if (rc)
- netfs_subreq_terminated(subreq, rc, false);
+ goto failed;
+ return;
+
+failed:
+ netfs_read_subreq_terminated(subreq, rc, false);
}
/*
@@ -286,12 +291,6 @@ static void cifs_rreq_done(struct netfs_io_request *rreq)
inode_set_atime_to_ts(inode, inode_get_mtime(inode));
}
-static void cifs_post_modify(struct inode *inode)
-{
- /* Indication to update ctime and mtime as close is deferred */
- set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
-}
-
static void cifs_free_request(struct netfs_io_request *rreq)
{
struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
@@ -315,7 +314,7 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq)
#endif
}
- if (rdata->credits.value != 0)
+ if (rdata->credits.value != 0) {
trace_smb3_rw_credits(rdata->rreq->debug_id,
rdata->subreq.debug_index,
rdata->credits.value,
@@ -323,8 +322,12 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq)
rdata->server ? rdata->server->in_flight : 0,
-rdata->credits.value,
cifs_trace_rw_credits_free_subreq);
+ if (rdata->server)
+ add_credits_and_wake_if(rdata->server, &rdata->credits, 0);
+ else
+ rdata->credits.value = 0;
+ }
- add_credits_and_wake_if(rdata->server, &rdata->credits, 0);
if (rdata->have_xid)
free_xid(rdata->xid);
}
@@ -335,10 +338,9 @@ const struct netfs_request_ops cifs_req_ops = {
.init_request = cifs_init_request,
.free_request = cifs_free_request,
.free_subrequest = cifs_free_subrequest,
- .clamp_length = cifs_clamp_length,
- .issue_read = cifs_req_issue_read,
+ .prepare_read = cifs_prepare_read,
+ .issue_read = cifs_issue_read,
.done = cifs_rreq_done,
- .post_modify = cifs_post_modify,
.begin_writeback = cifs_begin_writeback,
.prepare_write = cifs_prepare_write,
.issue_write = cifs_issue_write,
@@ -1362,7 +1364,7 @@ int cifs_close(struct inode *inode, struct file *file)
dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
if ((cfile->status_file_deleted == false) &&
(smb2_can_defer_close(inode, dclose))) {
- if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
+ if (test_and_clear_bit(NETFS_ICTX_MODIFIED_ATTR, &cinode->netfs.flags)) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
}
@@ -1401,7 +1403,7 @@ void
cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
{
struct cifsFileInfo *open_file, *tmp;
- struct list_head tmp_list;
+ LIST_HEAD(tmp_list);
if (!tcon->use_persistent || !tcon->need_reopen_files)
return;
@@ -1409,7 +1411,6 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon)
tcon->need_reopen_files = false;
cifs_dbg(FYI, "Reopen persistent handles\n");
- INIT_LIST_HEAD(&tmp_list);
/* list all files open on tree connection, reopen resilient handles */
spin_lock(&tcon->open_file_lock);
@@ -2092,9 +2093,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifsLockInfo *li, *tmp;
__u64 length = cifs_flock_len(flock);
- struct list_head tmp_llist;
-
- INIT_LIST_HEAD(&tmp_llist);
+ LIST_HEAD(tmp_llist);
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
@@ -2749,6 +2748,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file->f_mapping->host;
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
ssize_t rc;
rc = netfs_start_io_write(inode);
@@ -2765,12 +2765,16 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
if (rc <= 0)
goto out;
- if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) &&
+ (cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
server->vals->exclusive_lock_type, 0,
- NULL, CIFS_WRITE_OP))
- rc = netfs_buffered_write_iter_locked(iocb, from, NULL);
- else
+ NULL, CIFS_WRITE_OP))) {
rc = -EACCES;
+ goto out;
+ }
+
+ rc = netfs_buffered_write_iter_locked(iocb, from, NULL);
+
out:
up_read(&cinode->lock_sem);
netfs_end_io_write(inode);
@@ -2902,9 +2906,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
if (!CIFS_CACHE_READ(cinode))
return netfs_unbuffered_read_iter(iocb, to);
- if (cap_unix(tcon->ses) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) {
if (iocb->ki_flags & IOCB_DIRECT)
return netfs_unbuffered_read_iter(iocb, to);
return netfs_buffered_read_iter(iocb, to);
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index bc926ab2555b..28c4e576d460 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -978,9 +978,12 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
switch (opt) {
case Opt_compress:
+ if (!IS_ENABLED(CONFIG_CIFS_COMPRESSION)) {
+ cifs_errorf(fc, "CONFIG_CIFS_COMPRESSION kernel config option is unset\n");
+ goto cifs_parse_mount_err;
+ }
ctx->compress = true;
- cifs_dbg(VFS,
- "SMB3 compression support is experimental\n");
+ cifs_dbg(VFS, "SMB3 compression support is experimental\n");
break;
case Opt_nodfs:
ctx->nodfs = 1;
@@ -1896,14 +1899,17 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb)
if (ctx->mfsymlinks) {
if (ctx->sfu_emul) {
/*
- * Our SFU ("Services for Unix" emulation does not allow
- * creating symlinks but does allow reading existing SFU
- * symlinks (it does allow both creating and reading SFU
- * style mknod and FIFOs though). When "mfsymlinks" and
+ * Our SFU ("Services for Unix") emulation allows now
+ * creating new and reading existing SFU symlinks.
+ * Older Linux kernel versions were not able to neither
+ * read existing nor create new SFU symlinks. But
+ * creating and reading SFU style mknod and FIFOs was
+ * supported for long time. When "mfsymlinks" and
* "sfu" are both enabled at the same time, it allows
* reading both types of symlinks, but will only create
* them with mfsymlinks format. This allows better
- * Apple compatibility (probably better for Samba too)
+ * Apple compatibility, compatibility with older Linux
+ * kernel clients (probably better for Samba too)
* while still recognizing old Windows style symlinks.
*/
cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n");
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index dd0afa23734c..331a86074ae7 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -172,6 +172,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
CIFS_I(inode)->time = 0; /* force reval */
return -ESTALE;
}
+ if (inode->i_state & I_NEW)
+ CIFS_I(inode)->netfs.zero_point = fattr->cf_eof;
cifs_revalidate_cache(inode, fattr);
@@ -527,6 +529,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
struct cifs_fid fid;
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {0};
+ char *symlink_buf_utf16;
+ unsigned int symlink_len_utf16;
char buf[24];
unsigned int bytes_read;
char *pbuf;
@@ -537,10 +541,11 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
fattr->cf_mode &= ~S_IFMT;
if (fattr->cf_eof == 0) {
+ cifs_dbg(FYI, "Fifo\n");
fattr->cf_mode |= S_IFIFO;
fattr->cf_dtype = DT_FIFO;
return 0;
- } else if (fattr->cf_eof < 8) {
+ } else if (fattr->cf_eof > 1 && fattr->cf_eof < 8) {
fattr->cf_mode |= S_IFREG;
fattr->cf_dtype = DT_REG;
return -EINVAL; /* EOPNOTSUPP? */
@@ -582,7 +587,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms,
&bytes_read, &pbuf, &buf_type);
if ((rc == 0) && (bytes_read >= 8)) {
- if (memcmp("IntxBLK", pbuf, 8) == 0) {
+ if (memcmp("IntxBLK\0", pbuf, 8) == 0) {
cifs_dbg(FYI, "Block device\n");
fattr->cf_mode |= S_IFBLK;
fattr->cf_dtype = DT_BLK;
@@ -594,7 +599,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
fattr->cf_rdev = MKDEV(mjr, mnr);
}
- } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
+ } else if (memcmp("IntxCHR\0", pbuf, 8) == 0) {
cifs_dbg(FYI, "Char device\n");
fattr->cf_mode |= S_IFCHR;
fattr->cf_dtype = DT_CHR;
@@ -610,10 +615,37 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
cifs_dbg(FYI, "Socket\n");
fattr->cf_mode |= S_IFSOCK;
fattr->cf_dtype = DT_SOCK;
- } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
+ } else if (memcmp("IntxLNK\1", pbuf, 8) == 0) {
cifs_dbg(FYI, "Symlink\n");
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
+ if ((fattr->cf_eof > 8) && (fattr->cf_eof % 2 == 0)) {
+ symlink_buf_utf16 = kmalloc(fattr->cf_eof-8 + 1, GFP_KERNEL);
+ if (symlink_buf_utf16) {
+ io_parms.offset = 8;
+ io_parms.length = fattr->cf_eof-8 + 1;
+ buf_type = CIFS_NO_BUFFER;
+ rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms,
+ &symlink_len_utf16,
+ &symlink_buf_utf16,
+ &buf_type);
+ if ((rc == 0) &&
+ (symlink_len_utf16 > 0) &&
+ (symlink_len_utf16 < fattr->cf_eof-8 + 1) &&
+ (symlink_len_utf16 % 2 == 0)) {
+ fattr->cf_symlink_target =
+ cifs_strndup_from_utf16(symlink_buf_utf16,
+ symlink_len_utf16,
+ true,
+ cifs_sb->local_nls);
+ if (!fattr->cf_symlink_target)
+ rc = -ENOMEM;
+ }
+ kfree(symlink_buf_utf16);
+ } else {
+ rc = -ENOMEM;
+ }
+ }
} else if (memcmp("LnxFIFO", pbuf, 8) == 0) {
cifs_dbg(FYI, "FIFO\n");
fattr->cf_mode |= S_IFIFO;
@@ -623,6 +655,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
fattr->cf_dtype = DT_REG;
rc = -EOPNOTSUPP;
}
+ } else if ((rc == 0) && (bytes_read == 1) && (pbuf[0] == '\0')) {
+ cifs_dbg(FYI, "Socket\n");
+ fattr->cf_mode |= S_IFSOCK;
+ fattr->cf_dtype = DT_SOCK;
} else {
fattr->cf_mode |= S_IFREG; /* then it is a file */
fattr->cf_dtype = DT_REG;
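The SFU parsing above implies a simple layout for these special files: an 8-byte type tag, optionally followed by type-specific payload. A hypothetical struct, mirroring the win_dev definition this series removes from cifspdu.h, just to visualise what sync_read() pulls out (illustration only):

	struct sfu_dev {
		unsigned char type[8];	/* "IntxBLK\0" or "IntxCHR\0" */
		__le64 major;
		__le64 minor;
	} __attribute__((packed));

An "IntxLNK\1" tag is instead followed by the UTF-16LE target path (read from offset 8 by the new code above), the FIFO/socket tags carry no payload, a zero-length file is treated as a FIFO, and a one-byte file containing '\0' is treated as a socket.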
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 44dbaf9929a4..9bb5c869f4db 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -229,9 +229,11 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg)
shutdown_good:
trace_smb3_shutdown_done(flags, tcon->tid);
+ cifs_put_tlink(tlink);
return 0;
shutdown_out_err:
trace_smb3_shutdown_err(rc, flags, tcon->tid);
+ cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index d86da949a919..47ddeb7fa111 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -588,6 +588,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
rc = PTR_ERR(tlink);
+ /* BB could be clearer if we skipped the put_tlink on error here, but it is harmless */
goto symlink_exit;
}
pTcon = tlink_tcon(tlink);
@@ -605,6 +606,9 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
+ } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon,
+ full_path, S_IFLNK, 0, symname);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
} else if (pTcon->unix_ext) {
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index b28ff62f1f15..dab526191b07 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -352,7 +352,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
* on simple responses (wct, bcc both zero)
* in particular have seen this on
* ulogoffX and FindClose. This leaves
- * one byte of bcc potentially unitialized
+ * one byte of bcc potentially uninitialized
*/
/* zero rest of bcc */
tmp[sizeof(struct smb_hdr)+1] = 0;
@@ -751,12 +751,11 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *cfile = NULL;
struct file_list *tmp_list, *tmp_next_list;
- struct list_head file_head;
+ LIST_HEAD(file_head);
if (cifs_inode == NULL)
return;
- INIT_LIST_HEAD(&file_head);
spin_lock(&cifs_inode->open_file_lock);
list_for_each_entry(cfile, &cifs_inode->openFileList, flist) {
if (delayed_work_pending(&cfile->deferred)) {
@@ -787,9 +786,8 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon)
{
struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
- struct list_head file_head;
+ LIST_HEAD(file_head);
- INIT_LIST_HEAD(&file_head);
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
if (delayed_work_pending(&cfile->deferred)) {
@@ -819,11 +817,10 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
{
struct cifsFileInfo *cfile;
struct file_list *tmp_list, *tmp_next_list;
- struct list_head file_head;
void *page;
const char *full_path;
+ LIST_HEAD(file_head);
- INIT_LIST_HEAD(&file_head);
page = alloc_dentry_path();
spin_lock(&tcon->open_file_lock);
list_for_each_entry(cfile, &tcon->openFileList, tlist) {
@@ -1234,6 +1231,7 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid,
const char *full_path,
bool *islink)
{
+ struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_ses *ses = tcon->ses;
size_t len;
char *path;
@@ -1250,12 +1248,12 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid,
!is_tcon_dfs(tcon))
return 0;
- spin_lock(&tcon->tc_lock);
- if (!tcon->origin_fullpath) {
- spin_unlock(&tcon->tc_lock);
+ spin_lock(&server->srv_lock);
+ if (!server->leaf_fullpath) {
+ spin_unlock(&server->srv_lock);
return 0;
}
- spin_unlock(&tcon->tc_lock);
+ spin_unlock(&server->srv_lock);
/*
* Slow path - tcon is DFS and @full_path has prefix path, so attempt
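The misc.c conversions above drop the separate INIT_LIST_HEAD() call in favour of the LIST_HEAD() declarator. A small illustrative sketch of the on-stack idiom those functions follow (collect entries under a lock, then process them off-list), assuming only the standard list and spinlock headers:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	static void drain_list(struct list_head *shared, spinlock_t *lock)
	{
		LIST_HEAD(local);			/* declares and initialises the head in one step */

		spin_lock(lock);
		list_splice_init(shared, &local);	/* detach everything while holding the lock */
		spin_unlock(lock);

		/* walk &local and dispose of entries without the lock held */
	}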
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 689d8a506d45..48c27581ec51 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -378,6 +378,8 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
bool unicode, struct cifs_open_info_data *data)
{
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
data->reparse.buf = buf;
/* See MS-FSCC 2.1.2 */
@@ -394,12 +396,13 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
case IO_REPARSE_TAG_LX_FIFO:
case IO_REPARSE_TAG_LX_CHR:
case IO_REPARSE_TAG_LX_BLK:
- return 0;
+ break;
default:
- cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
- __func__, le32_to_cpu(buf->ReparseTag));
- return -EOPNOTSUPP;
+ cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n",
+ le32_to_cpu(buf->ReparseTag));
+ break;
}
+ return 0;
}
int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index e1f2feb56f45..e03c91a49650 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -1078,7 +1078,7 @@ cifs_make_node(unsigned int xid, struct inode *inode,
/*
* Check if mounted with the 'sfu' mount parm.
* SFU emulation should work with all servers, but only
- * supports block and char device (no socket & fifo),
+ * supports block and char device, socket & fifo,
* and was used by default in earlier versions of Windows
*/
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index c23478ab1cf8..e301349b0078 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -21,7 +21,7 @@
#include "cifs_unicode.h"
#include "fscache.h"
#include "smb2proto.h"
-#include "smb2status.h"
+#include "../common/smb2status.h"
static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
{
@@ -196,9 +196,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifsLockInfo *li, *tmp;
__u64 length = 1 + flock->fl_end - flock->fl_start;
- struct list_head tmp_llist;
-
- INIT_LIST_HEAD(&tmp_llist);
+ LIST_HEAD(tmp_llist);
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 9f5bc41433c1..b992117377e9 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -24,7 +24,7 @@
#include "smb2pdu.h"
#include "smb2proto.h"
#include "cached_dir.h"
-#include "smb2status.h"
+#include "../common/smb2status.h"
static struct reparse_data_buffer *reparse_buf_ptr(struct kvec *iov)
{
@@ -315,7 +315,7 @@ replay_again:
SMB2_O_INFO_FILE, 0,
sizeof(struct smb311_posix_qinfo *) +
(PATH_MAX * 2) +
- (sizeof(struct cifs_sid) * 2), 0, NULL);
+ (sizeof(struct smb_sid) * 2), 0, NULL);
} else {
rc = SMB2_query_info_init(tcon, server,
&rqst[num_rqst],
@@ -325,7 +325,7 @@ replay_again:
SMB2_O_INFO_FILE, 0,
sizeof(struct smb311_posix_qinfo *) +
(PATH_MAX * 2) +
- (sizeof(struct cifs_sid) * 2), 0, NULL);
+ (sizeof(struct smb_sid) * 2), 0, NULL);
}
if (!rc && (!cfile || num_rqst > 1)) {
smb2_set_next_command(tcon, &rqst[num_rqst]);
@@ -1106,6 +1106,8 @@ int smb2_rename_path(const unsigned int xid,
co, DELETE, SMB2_OP_RENAME, cfile, source_dentry);
if (rc == -EINVAL) {
cifs_dbg(FYI, "invalid lease key, resending request without lease");
+ cifs_get_writable_path(tcon, from_name,
+ FIND_WR_WITH_DELETE, &cfile);
rc = smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
co, DELETE, SMB2_OP_RENAME, cfile, NULL);
}
@@ -1149,6 +1151,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
cfile, NULL, NULL, dentry);
if (rc == -EINVAL) {
cifs_dbg(FYI, "invalid lease key, resending request without lease");
+ cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb,
full_path, &oparms, &in_iov,
&(int){SMB2_OP_SET_EOF}, 1,
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index ac1895358908..b05313acf9b2 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -12,7 +12,7 @@
#include "cifs_debug.h"
#include "smb2pdu.h"
#include "smb2proto.h"
-#include "smb2status.h"
+#include "../common/smb2status.h"
#include "smb2glob.h"
#include "trace.h"
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 677ef6f99a5b..f3c4b70b77b9 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -13,7 +13,7 @@
#include "smb2proto.h"
#include "cifs_debug.h"
#include "cifs_unicode.h"
-#include "smb2status.h"
+#include "../common/smb2status.h"
#include "smb2glob.h"
#include "nterr.h"
#include "cached_dir.h"
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 322cabc69c6f..7381ec333c6d 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -13,6 +13,7 @@
#include <linux/sort.h>
#include <crypto/aead.h>
#include <linux/fiemap.h>
+#include <linux/folio_queue.h>
#include <uapi/linux/magic.h>
#include "cifsfs.h"
#include "cifsglob.h"
@@ -21,7 +22,7 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifs_unicode.h"
-#include "smb2status.h"
+#include "../common/smb2status.h"
#include "smb2glob.h"
#include "cifs_ioctl.h"
#include "smbdirect.h"
@@ -301,7 +302,8 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
unsigned int /*enum smb3_rw_credits_trace*/ trace)
{
struct cifs_credits *credits = &subreq->credits;
- int new_val = DIV_ROUND_UP(subreq->subreq.len, SMB2_MAX_BUFFER_SIZE);
+ int new_val = DIV_ROUND_UP(subreq->subreq.len - subreq->subreq.transferred,
+ SMB2_MAX_BUFFER_SIZE);
int scredits, in_flight;
if (!credits->value || credits->value == new_val)
@@ -316,7 +318,8 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
cifs_trace_rw_credits_no_adjust_up);
trace_smb3_too_many_credits(server->CurrentMid,
server->conn_id, server->hostname, 0, credits->value - new_val, 0);
- cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)",
+ cifs_server_dbg(VFS, "R=%x[%x] request has less credits (%d) than required (%d)",
+ subreq->rreq->debug_id, subreq->subreq.debug_index,
credits->value, new_val);
return -EOPNOTSUPP;
@@ -338,8 +341,9 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
trace_smb3_reconnect_detected(server->CurrentMid,
server->conn_id, server->hostname, scredits,
credits->value - new_val, in_flight);
- cifs_server_dbg(VFS, "trying to return %d credits to old session\n",
- credits->value - new_val);
+ cifs_server_dbg(VFS, "R=%x[%x] trying to return %d credits to old session\n",
+ subreq->rreq->debug_id, subreq->subreq.debug_index,
+ credits->value - new_val);
return -EAGAIN;
}
@@ -3046,11 +3050,11 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
return rc;
}
-static struct cifs_ntsd *
+static struct smb_ntsd *
get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen, u32 info)
{
- struct cifs_ntsd *pntsd = NULL;
+ struct smb_ntsd *pntsd = NULL;
unsigned int xid;
int rc = -EOPNOTSUPP;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -3075,11 +3079,11 @@ get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
}
-static struct cifs_ntsd *
+static struct smb_ntsd *
get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
const char *path, u32 *pacllen, u32 info)
{
- struct cifs_ntsd *pntsd = NULL;
+ struct smb_ntsd *pntsd = NULL;
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
unsigned int xid;
int rc;
@@ -3142,7 +3146,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
}
static int
-set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen,
struct inode *inode, const char *path, int aclflag)
{
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -3200,12 +3204,12 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
}
/* Retrieve an ACL from the server */
-static struct cifs_ntsd *
+static struct smb_ntsd *
get_smb2_acl(struct cifs_sb_info *cifs_sb,
struct inode *inode, const char *path,
u32 *pacllen, u32 info)
{
- struct cifs_ntsd *pntsd = NULL;
+ struct smb_ntsd *pntsd = NULL;
struct cifsFileInfo *open_file = NULL;
if (inode && !(info & SACL_SECINFO))
@@ -3237,13 +3241,15 @@ static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon,
}
static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
- loff_t offset, loff_t len, bool keep_size)
+ unsigned long long offset, unsigned long long len,
+ bool keep_size)
{
struct cifs_ses *ses = tcon->ses;
struct inode *inode = file_inode(file);
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct cifsFileInfo *cfile = file->private_data;
- unsigned long long new_size;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ unsigned long long i_size, new_size, remote_size;
long rc;
unsigned int xid;
@@ -3255,6 +3261,16 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
inode_lock(inode);
filemap_invalidate_lock(inode->i_mapping);
+ i_size = i_size_read(inode);
+ remote_size = ictx->remote_i_size;
+ if (offset + len >= remote_size && offset < i_size) {
+ unsigned long long top = umin(offset + len, i_size);
+
+ rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1);
+ if (rc < 0)
+ goto zero_range_exit;
+ }
+
/*
* We zero the range through ioctl, so we need to remove the page caches
* first, otherwise the data may be inconsistent with the server.
@@ -3305,6 +3321,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
+ unsigned long long end = offset + len, i_size, remote_i_size;
long rc;
unsigned int xid;
__u8 set_sparse = 1;
@@ -3336,6 +3353,27 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
(char *)&fsctl_buf,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
+
+ if (rc)
+ goto unlock;
+
+ /* If there's dirty data in the buffer that would extend the EOF if it
+ * were written, then we need to move the EOF marker over to the lower
+ * of the high end of the hole and the proposed EOF. The problem is
+ * that if we locally hole-punch the tail of the dirty data, the proposed
+ * EOF update will end up in the wrong place.
+ */
+ i_size = i_size_read(inode);
+ remote_i_size = netfs_inode(inode)->remote_i_size;
+ if (end > remote_i_size && i_size > remote_i_size) {
+ unsigned long long extend_to = umin(end, i_size);
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid, extend_to);
+ if (rc >= 0)
+ netfs_inode(inode)->remote_i_size = extend_to;
+ }
+
+unlock:
filemap_invalidate_unlock(inode->i_mapping);
out:
inode_unlock(inode);
@@ -4356,30 +4394,86 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
}
/*
- * Clear a read buffer, discarding the folios which have XA_MARK_0 set.
+ * Clear a read buffer, discarding the folios which have the 1st mark set.
+ */
+static void cifs_clear_folioq_buffer(struct folio_queue *buffer)
+{
+ struct folio_queue *folioq;
+
+ while ((folioq = buffer)) {
+ for (int s = 0; s < folioq_count(folioq); s++)
+ if (folioq_is_marked(folioq, s))
+ folio_put(folioq_folio(folioq, s));
+ buffer = folioq->next;
+ kfree(folioq);
+ }
+}
+
+/*
+ * Allocate buffer space into a folio queue.
*/
-static void cifs_clear_xarray_buffer(struct xarray *buffer)
+static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size)
{
+ struct folio_queue *buffer = NULL, *tail = NULL, *p;
struct folio *folio;
+ unsigned int slot;
+
+ do {
+ if (!tail || folioq_full(tail)) {
+ p = kmalloc(sizeof(*p), GFP_NOFS);
+ if (!p)
+ goto nomem;
+ folioq_init(p);
+ if (tail) {
+ tail->next = p;
+ p->prev = tail;
+ } else {
+ buffer = p;
+ }
+ tail = p;
+ }
+
+ folio = folio_alloc(GFP_KERNEL|__GFP_HIGHMEM, 0);
+ if (!folio)
+ goto nomem;
- XA_STATE(xas, buffer, 0);
+ slot = folioq_append_mark(tail, folio);
+ size -= folioq_folio_size(tail, slot);
+ } while (size > 0);
+
+ return buffer;
+
+nomem:
+ cifs_clear_folioq_buffer(buffer);
+ return NULL;
+}
+
+/*
+ * Copy data from an iterator to the folios in a folio queue buffer.
+ */
+static bool cifs_copy_iter_to_folioq(struct iov_iter *iter, size_t size,
+ struct folio_queue *buffer)
+{
+ for (; buffer; buffer = buffer->next) {
+ for (int s = 0; s < folioq_count(buffer); s++) {
+ struct folio *folio = folioq_folio(buffer, s);
+ size_t part = folioq_folio_size(buffer, s);
- rcu_read_lock();
- xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) {
- folio_put(folio);
+ part = umin(part, size);
+
+ if (copy_folio_from_iter(folio, 0, part, iter) != part)
+ return false;
+ size -= part;
+ }
}
- rcu_read_unlock();
- xa_destroy(buffer);
+ return true;
}
void
smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst)
{
- int i;
-
- for (i = 0; i < num_rqst; i++)
- if (!xa_empty(&rqst[i].rq_buffer))
- cifs_clear_xarray_buffer(&rqst[i].rq_buffer);
+ for (int i = 0; i < num_rqst; i++)
+ cifs_clear_folioq_buffer(rqst[i].rq_buffer);
}
/*
@@ -4400,53 +4494,32 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *new_rq, struct smb_rqst *old_rq)
{
struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base;
- struct page *page;
unsigned int orig_len = 0;
- int i, j;
int rc = -ENOMEM;
- for (i = 1; i < num_rqst; i++) {
+ for (int i = 1; i < num_rqst; i++) {
struct smb_rqst *old = &old_rq[i - 1];
struct smb_rqst *new = &new_rq[i];
- struct xarray *buffer = &new->rq_buffer;
- size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0;
+ struct folio_queue *buffer;
+ size_t size = iov_iter_count(&old->rq_iter);
orig_len += smb_rqst_len(server, old);
new->rq_iov = old->rq_iov;
new->rq_nvec = old->rq_nvec;
- xa_init(buffer);
-
if (size > 0) {
- unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE);
-
- for (j = 0; j < npages; j++) {
- void *o;
-
- rc = -ENOMEM;
- page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
- if (!page)
- goto err_free;
- page->index = j;
- o = xa_store(buffer, j, page, GFP_KERNEL);
- if (xa_is_err(o)) {
- rc = xa_err(o);
- put_page(page);
- goto err_free;
- }
+ buffer = cifs_alloc_folioq_buffer(size);
+ if (!buffer)
+ goto err_free;
- xa_set_mark(buffer, j, XA_MARK_0);
+ new->rq_buffer = buffer;
+ iov_iter_folio_queue(&new->rq_iter, ITER_SOURCE,
+ buffer, 0, 0, size);
- seg = min_t(size_t, size - copied, PAGE_SIZE);
- if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) {
- rc = -EFAULT;
- goto err_free;
- }
- copied += seg;
+ if (!cifs_copy_iter_to_folioq(&old->rq_iter, size, buffer)) {
+ rc = -EIO;
+ goto err_free;
}
- iov_iter_xarray(&new->rq_iter, ITER_SOURCE,
- buffer, 0, size);
- new->rq_iter_size = size;
}
}
@@ -4492,7 +4565,6 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
rqst.rq_nvec = 2;
if (iter) {
rqst.rq_iter = *iter;
- rqst.rq_iter_size = iov_iter_count(iter);
iter_size = iov_iter_count(iter);
}
@@ -4511,22 +4583,23 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
}
static int
-cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size,
- unsigned int skip, struct iov_iter *iter)
+cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size,
+ size_t skip, struct iov_iter *iter)
{
- struct page *page;
- unsigned long index;
-
- xa_for_each(pages, index, page) {
- size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size);
-
- n = copy_page_to_iter(page, skip, len, iter);
- if (n != len) {
- cifs_dbg(VFS, "%s: something went wrong\n", __func__);
- return -EIO;
+ for (; folioq; folioq = folioq->next) {
+ for (int s = 0; s < folioq_count(folioq); s++) {
+ struct folio *folio = folioq_folio(folioq, s);
+ size_t fsize = folio_size(folio);
+ size_t n, len = umin(fsize - skip, data_size);
+
+ n = copy_folio_to_iter(folio, skip, len, iter);
+ if (n != len) {
+ cifs_dbg(VFS, "%s: something went wrong\n", __func__);
+ return -EIO;
+ }
+ data_size -= n;
+ skip = 0;
}
- data_size -= n;
- skip = 0;
}
return 0;
@@ -4534,8 +4607,8 @@ cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size,
static int
handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
- char *buf, unsigned int buf_len, struct xarray *pages,
- unsigned int pages_len, bool is_offloaded)
+ char *buf, unsigned int buf_len, struct folio_queue *buffer,
+ unsigned int buffer_len, bool is_offloaded)
{
unsigned int data_offset;
unsigned int data_len;
@@ -4632,7 +4705,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- if (data_len > pages_len - pad_len) {
+ if (data_len > buffer_len - pad_len) {
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
if (is_offloaded)
@@ -4643,8 +4716,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
}
/* Copy the data to the output I/O iterator. */
- rdata->result = cifs_copy_pages_to_iter(pages, pages_len,
- cur_off, &rdata->subreq.io_iter);
+ rdata->result = cifs_copy_folioq_to_iter(buffer, buffer_len,
+ cur_off, &rdata->subreq.io_iter);
if (rdata->result != 0) {
if (is_offloaded)
mid->mid_state = MID_RESPONSE_MALFORMED;
@@ -4652,12 +4725,11 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
dequeue_mid(mid, rdata->result);
return 0;
}
- rdata->got_bytes = pages_len;
+ rdata->got_bytes = buffer_len;
} else if (buf_len >= data_offset + data_len) {
/* read response payload is in buf */
- WARN_ONCE(pages && !xa_empty(pages),
- "read data can be either in buf or in pages");
+ WARN_ONCE(buffer, "read data can be either in buf or in buffer");
length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter);
if (length < 0)
return length;
@@ -4683,7 +4755,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
struct smb2_decrypt_work {
struct work_struct decrypt;
struct TCP_Server_Info *server;
- struct xarray buffer;
+ struct folio_queue *buffer;
char *buf;
unsigned int len;
};
@@ -4697,7 +4769,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
struct mid_q_entry *mid;
struct iov_iter iter;
- iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len);
+ iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, dw->len);
rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size,
&iter, true);
if (rc) {
@@ -4713,7 +4785,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->decrypted = true;
rc = handle_read_data(dw->server, mid, dw->buf,
dw->server->vals->read_rsp_size,
- &dw->buffer, dw->len,
+ dw->buffer, dw->len,
true);
if (rc >= 0) {
#ifdef CONFIG_CIFS_STATS2
@@ -4746,7 +4818,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
}
free_pages:
- cifs_clear_xarray_buffer(&dw->buffer);
+ cifs_clear_folioq_buffer(dw->buffer);
cifs_small_buf_release(dw->buf);
kfree(dw);
}
@@ -4756,20 +4828,17 @@ static int
receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
int *num_mids)
{
- struct page *page;
char *buf = server->smallbuf;
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
struct iov_iter iter;
- unsigned int len, npages;
+ unsigned int len;
unsigned int buflen = server->pdu_size;
int rc;
- int i = 0;
struct smb2_decrypt_work *dw;
dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL);
if (!dw)
return -ENOMEM;
- xa_init(&dw->buffer);
INIT_WORK(&dw->decrypt, smb2_decrypt_offload);
dw->server = server;
@@ -4785,26 +4854,14 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
len = le32_to_cpu(tr_hdr->OriginalMessageSize) -
server->vals->read_rsp_size;
dw->len = len;
- npages = DIV_ROUND_UP(len, PAGE_SIZE);
+ len = round_up(dw->len, PAGE_SIZE);
rc = -ENOMEM;
- for (; i < npages; i++) {
- void *old;
-
- page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
- if (!page)
- goto discard_data;
- page->index = i;
- old = xa_store(&dw->buffer, i, page, GFP_KERNEL);
- if (xa_is_err(old)) {
- rc = xa_err(old);
- put_page(page);
- goto discard_data;
- }
- xa_set_mark(&dw->buffer, i, XA_MARK_0);
- }
+ dw->buffer = cifs_alloc_folioq_buffer(len);
+ if (!dw->buffer)
+ goto discard_data;
- iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE);
+ iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, len);
/* Read the data into the buffer and clear excess bufferage. */
rc = cifs_read_iter_from_socket(server, &iter, dw->len);
@@ -4812,9 +4869,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
goto discard_data;
server->total_read += rc;
- if (rc < npages * PAGE_SIZE)
- iov_iter_zero(npages * PAGE_SIZE - rc, &iter);
- iov_iter_revert(&iter, npages * PAGE_SIZE);
+ if (rc < len)
+ iov_iter_zero(len - rc, &iter);
+ iov_iter_revert(&iter, len);
iov_iter_truncate(&iter, dw->len);
rc = cifs_discard_remaining_data(server);
@@ -4849,7 +4906,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
(*mid)->decrypted = true;
rc = handle_read_data(server, *mid, buf,
server->vals->read_rsp_size,
- &dw->buffer, dw->len, false);
+ dw->buffer, dw->len, false);
if (rc >= 0) {
if (server->ops->is_network_name_deleted) {
server->ops->is_network_name_deleted(buf,
@@ -4859,7 +4916,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
}
free_pages:
- cifs_clear_xarray_buffer(&dw->buffer);
+ cifs_clear_folioq_buffer(dw->buffer);
free_dw:
kfree(dw);
return rc;
@@ -5021,9 +5078,10 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf,
return 0;
}
-static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname)
{
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
@@ -5031,30 +5089,64 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_fid fid;
unsigned int bytes_written;
- struct win_dev pdev = {};
- struct kvec iov[2];
+ u8 type[8];
+ int type_len = 0;
+ struct {
+ __le64 major;
+ __le64 minor;
+ } __packed pdev = {};
+ __le16 *symname_utf16 = NULL;
+ u8 *data = NULL;
+ int data_len = 0;
+ struct kvec iov[3];
__u32 oplock = server->oplocks ? REQ_OPLOCK : 0;
int rc;
switch (mode & S_IFMT) {
case S_IFCHR:
- strscpy(pdev.type, "IntxCHR");
+ type_len = 8;
+ memcpy(type, "IntxCHR\0", type_len);
pdev.major = cpu_to_le64(MAJOR(dev));
pdev.minor = cpu_to_le64(MINOR(dev));
+ data = (u8 *)&pdev;
+ data_len = sizeof(pdev);
break;
case S_IFBLK:
- strscpy(pdev.type, "IntxBLK");
+ type_len = 8;
+ memcpy(type, "IntxBLK\0", type_len);
pdev.major = cpu_to_le64(MAJOR(dev));
pdev.minor = cpu_to_le64(MINOR(dev));
+ data = (u8 *)&pdev;
+ data_len = sizeof(pdev);
+ break;
+ case S_IFLNK:
+ type_len = 8;
+ memcpy(type, "IntxLNK\1", type_len);
+ symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname),
+ &data_len, cifs_sb->local_nls,
+ NO_MAP_UNI_RSVD);
+ if (!symname_utf16) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ data_len -= 2; /* symlink is without trailing wide-nul */
+ data = (u8 *)symname_utf16;
break;
case S_IFSOCK:
- strscpy(pdev.type, "LnxSOCK");
+ type_len = 8;
+ strscpy(type, "LnxSOCK");
+ data = (u8 *)&pdev;
+ data_len = sizeof(pdev);
break;
case S_IFIFO:
- strscpy(pdev.type, "LnxFIFO");
+ type_len = 8;
+ strscpy(type, "LnxFIFO");
+ data = (u8 *)&pdev;
+ data_len = sizeof(pdev);
break;
default:
- return -EPERM;
+ rc = -EPERM;
+ goto out;
}
oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE,
@@ -5064,17 +5156,26 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
rc = server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
- return rc;
+ goto out;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.length = sizeof(pdev);
- iov[1].iov_base = &pdev;
- iov[1].iov_len = sizeof(pdev);
+ if (type_len + data_len > 0) {
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.length = type_len + data_len;
+ iov[1].iov_base = type;
+ iov[1].iov_len = type_len;
+ iov[2].iov_base = data;
+ iov[2].iov_len = data_len;
+
+ rc = server->ops->sync_write(xid, &fid, &io_parms,
+ &bytes_written,
+ iov, ARRAY_SIZE(iov)-1);
+ }
- rc = server->ops->sync_write(xid, &fid, &io_parms,
- &bytes_written, iov, 1);
server->ops->close(xid, tcon, &fid);
+
+out:
+ kfree(symname_utf16);
return rc;
}
@@ -5086,7 +5187,7 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
int rc;
rc = __cifs_sfu_make_node(xid, inode, dentry, tcon,
- full_path, mode, dev);
+ full_path, mode, dev, NULL);
if (rc)
return rc;
@@ -5115,7 +5216,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
/*
* Check if mounted with the 'sfu' mount parm.
* SFU emulation should work with all servers, but only
- * supports block and char device (no socket & fifo),
+ * supports block and char device, socket & fifo,
* and was used by default in earlier versions of Windows
*/
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
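The smb2ops.c changes above replace the xarray bounce buffer with a folio_queue. A rough sketch of the lifecycle the new helpers imply (allocate, fill from a source iterator, expose as an ITER_FOLIOQ iterator, then release), assuming the helpers keep the signatures shown in the diff; the function name here is illustrative only:

	static int bounce_through_folioq(struct iov_iter *src, size_t size)
	{
		struct folio_queue *buf;
		struct iov_iter iter;

		buf = cifs_alloc_folioq_buffer(size);		/* at least @size bytes of marked folios */
		if (!buf)
			return -ENOMEM;

		if (!cifs_copy_iter_to_folioq(src, size, buf)) {
			cifs_clear_folioq_buffer(buf);		/* puts marked folios, frees the queue */
			return -EIO;
		}

		iov_iter_folio_queue(&iter, ITER_SOURCE, buf, 0, 0, size);
		/* ... hand &iter to the transform/encryption path ... */

		cifs_clear_folioq_buffer(buf);
		return 0;
	}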
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 9a06b5594669..2cb1bf65a172 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -32,7 +32,7 @@
#include "cifs_unicode.h"
#include "cifs_debug.h"
#include "ntlmssp.h"
-#include "smb2status.h"
+#include "../common/smb2status.h"
#include "smb2glob.h"
#include "cifspdu.h"
#include "cifs_spnego.h"
@@ -42,6 +42,7 @@
#include "dfs_cache.h"
#endif
#include "cached_dir.h"
+#include "compress.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -82,6 +83,9 @@ int smb3_encryption_required(const struct cifs_tcon *tcon)
if (tcon->seal &&
(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
return 1;
+ if (((global_secflags & CIFSSEC_MUST_SEAL) == CIFSSEC_MUST_SEAL) &&
+ (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
+ return 1;
return 0;
}
@@ -2620,7 +2624,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
unsigned int group_offset = 0;
struct smb3_acl acl = {};
- *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
+ *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct smb_ace) * 4), 8);
if (set_owner) {
/* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */
@@ -2669,21 +2673,21 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
ptr += sizeof(struct smb3_acl);
/* create one ACE to hold the mode embedded in reserved special SID */
- acelen = setup_special_mode_ACE((struct cifs_ace *)ptr, (__u64)mode);
+ acelen = setup_special_mode_ACE((struct smb_ace *)ptr, (__u64)mode);
ptr += acelen;
acl_size = acelen + sizeof(struct smb3_acl);
ace_count = 1;
if (set_owner) {
/* we do not need to reallocate buffer to add the two more ACEs. plenty of space */
- acelen = setup_special_user_owner_ACE((struct cifs_ace *)ptr);
+ acelen = setup_special_user_owner_ACE((struct smb_ace *)ptr);
ptr += acelen;
acl_size += acelen;
ace_count += 1;
}
/* and one more ACE to allow access for authenticated users */
- acelen = setup_authusers_ACE((struct cifs_ace *)ptr);
+ acelen = setup_authusers_ACE((struct smb_ace *)ptr);
ptr += acelen;
acl_size += acelen;
ace_count += 1;
@@ -3903,7 +3907,7 @@ SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, struct smb311_posix_qinfo *data, u32 *plen)
{
size_t output_len = sizeof(struct smb311_posix_qinfo *) +
- (sizeof(struct cifs_sid) * 2) + (PATH_MAX * 2);
+ (sizeof(struct smb_sid) * 2) + (PATH_MAX * 2);
*plen = 0;
return query_info(xid, tcon, persistent_fid, volatile_fid,
@@ -4438,7 +4442,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* If we want to do a RDMA write, fill in and append
* smbd_buffer_descriptor_v1 to the end of read request
*/
- if (smb3_use_rdma_offload(io_parms)) {
+ if (rdata && smb3_use_rdma_offload(io_parms)) {
struct smbd_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
@@ -4495,15 +4499,14 @@ static void smb2_readv_worker(struct work_struct *work)
struct cifs_io_subrequest *rdata =
container_of(work, struct cifs_io_subrequest, subreq.work);
- netfs_subreq_terminated(&rdata->subreq,
- (rdata->result == 0 || rdata->result == -EAGAIN) ?
- rdata->got_bytes : rdata->result, true);
+ netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false);
}
static void
smb2_readv_callback(struct mid_q_entry *mid)
{
struct cifs_io_subrequest *rdata = mid->callback_data;
+ struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode);
struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink);
struct TCP_Server_Info *server = rdata->server;
struct smb2_hdr *shdr =
@@ -4520,16 +4523,15 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->got_bytes) {
rqst.rq_iter = rdata->subreq.io_iter;
- rqst.rq_iter_size = iov_iter_count(&rdata->subreq.io_iter);
}
WARN_ONCE(rdata->server != mid->server,
"rdata server %p != mid server %p",
rdata->server, mid->server);
- cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n",
+ cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n",
__func__, mid->mid, mid->mid_state, rdata->result,
- rdata->subreq.len);
+ rdata->got_bytes, rdata->subreq.len - rdata->subreq.transferred);
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
@@ -4551,6 +4553,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
+ __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
rdata->result = -EAGAIN;
if (server->sign && rdata->got_bytes)
/* reset bytes number since we can not check a sign */
@@ -4583,27 +4586,36 @@ smb2_readv_callback(struct mid_q_entry *mid)
rdata->subreq.debug_index,
rdata->xid,
rdata->req->cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, rdata->subreq.start,
- rdata->subreq.len, rdata->result);
+ tcon->tid, tcon->ses->Suid,
+ rdata->subreq.start + rdata->subreq.transferred,
+ rdata->subreq.len - rdata->subreq.transferred,
+ rdata->result);
} else
trace_smb3_read_done(rdata->rreq->debug_id,
rdata->subreq.debug_index,
rdata->xid,
rdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid,
- rdata->subreq.start, rdata->got_bytes);
+ rdata->subreq.start + rdata->subreq.transferred,
+ rdata->got_bytes);
if (rdata->result == -ENODATA) {
- /* We may have got an EOF error because fallocate
- * failed to enlarge the file.
- */
- if (rdata->subreq.start < rdata->subreq.rreq->i_size)
+ __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
+ rdata->result = 0;
+ } else {
+ size_t trans = rdata->subreq.transferred + rdata->got_bytes;
+ if (trans < rdata->subreq.len &&
+ rdata->subreq.start + trans == ictx->remote_i_size) {
+ __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0;
+ }
}
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value,
server->credits, server->in_flight,
0, cifs_trace_rw_credits_read_response_clear);
rdata->credits.value = 0;
+ rdata->subreq.transferred += rdata->got_bytes;
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress);
INIT_WORK(&rdata->subreq.work, smb2_readv_worker);
queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid);
@@ -4619,6 +4631,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
{
int rc, flags = 0;
char *buf;
+ struct netfs_io_subrequest *subreq = &rdata->subreq;
struct smb2_hdr *shdr;
struct cifs_io_parms io_parms;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
@@ -4629,15 +4642,15 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
int credit_request;
cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n",
- __func__, rdata->subreq.start, rdata->subreq.len);
+ __func__, subreq->start, subreq->len);
if (!rdata->server)
rdata->server = cifs_pick_channel(tcon->ses);
io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink);
io_parms.server = server = rdata->server;
- io_parms.offset = rdata->subreq.start;
- io_parms.length = rdata->subreq.len;
+ io_parms.offset = subreq->start + subreq->transferred;
+ io_parms.length = subreq->len - subreq->transferred;
io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid;
io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid;
io_parms.pid = rdata->req->pid;
@@ -4652,11 +4665,13 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
rdata->iov[0].iov_base = buf;
rdata->iov[0].iov_len = total_len;
+ rdata->got_bytes = 0;
+ rdata->result = 0;
shdr = (struct smb2_hdr *)buf;
if (rdata->credits.value > 0) {
- shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->subreq.len,
+ shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(io_parms.length,
SMB2_MAX_BUFFER_SIZE));
credit_request = le16_to_cpu(shdr->CreditCharge) + 8;
if (server->credits >= server->max_credits)
@@ -4680,11 +4695,12 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
if (rc) {
cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
trace_smb3_read_err(rdata->rreq->debug_id,
- rdata->subreq.debug_index,
+ subreq->debug_index,
rdata->xid, io_parms.persistent_fid,
io_parms.tcon->tid,
io_parms.tcon->ses->Suid,
- io_parms.offset, io_parms.length, rc);
+ io_parms.offset,
+ subreq->len - subreq->transferred, rc);
}
async_readv_out:
@@ -4867,6 +4883,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
server->credits, server->in_flight,
0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
cifs_write_subrequest_terminated(wdata, result ?: written, true);
release_mid(mid);
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
@@ -4911,6 +4928,13 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
if (rc)
goto out;
+ rqst.rq_iov = iov;
+ rqst.rq_iter = wdata->subreq.io_iter;
+
+ rqst.rq_iov[0].iov_len = total_len - 1;
+ rqst.rq_iov[0].iov_base = (char *)req;
+ rqst.rq_nvec += 1;
+
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -4922,6 +4946,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
req->Channel = SMB2_CHANNEL_NONE;
+ req->Length = cpu_to_le32(io_parms->length);
req->Offset = cpu_to_le64(io_parms->offset);
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer));
@@ -4941,7 +4966,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
*/
if (smb3_use_rdma_offload(io_parms)) {
struct smbd_buffer_descriptor_v1 *v1;
- size_t data_size = iov_iter_count(&wdata->subreq.io_iter);
bool need_invalidate = server->dialect == SMB30_PROT_ID;
wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter,
@@ -4950,9 +4974,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
rc = -EAGAIN;
goto async_writev_out;
}
+ /* For RDMA read, I/O size is in RemainingBytes not in Length */
+ req->RemainingBytes = req->Length;
req->Length = 0;
req->DataOffset = 0;
- req->RemainingBytes = cpu_to_le32(data_size);
req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
if (need_invalidate)
req->Channel = SMB2_CHANNEL_RDMA_V1;
@@ -4964,31 +4989,22 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
v1->offset = cpu_to_le64(wdata->mr->mr->iova);
v1->token = cpu_to_le32(wdata->mr->mr->rkey);
v1->length = cpu_to_le32(wdata->mr->mr->length);
+
+ rqst.rq_iov[0].iov_len += sizeof(*v1);
+
+ /*
+ * We keep wdata->subreq.io_iter,
+ * but we have to truncate rqst.rq_iter
+ */
+ iov_iter_truncate(&rqst.rq_iter, 0);
}
#endif
- iov[0].iov_len = total_len - 1;
- iov[0].iov_base = (char *)req;
- rqst.rq_iov = iov;
- rqst.rq_nvec = 1;
- rqst.rq_iter = wdata->subreq.io_iter;
- rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags))
smb2_set_replay(server, &rqst);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr)
- iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1);
-#endif
- cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n",
- io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter));
-#ifdef CONFIG_CIFS_SMB_DIRECT
- /* For RDMA read, I/O size is in RemainingBytes not in Length */
- if (!wdata->mr)
- req->Length = cpu_to_le32(io_parms->length);
-#else
- req->Length = cpu_to_le32(io_parms->length);
-#endif
+ cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n",
+ io_parms->offset, io_parms->length, iov_iter_count(&wdata->subreq.io_iter));
if (wdata->credits.value > 0) {
shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len,
@@ -5008,6 +5024,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
flags |= CIFS_HAS_CREDITS;
}
+ /* XXX: compression + encryption is unsupported for now */
+ if (((flags & CIFS_TRANSFORM_REQ) != CIFS_TRANSFORM_REQ) && should_compress(tcon, &rqst))
+ flags |= CIFS_COMPRESS_REQ;
+
rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
wdata, flags, &wdata->credits);
/* Can't touch wdata if rc == 0 */
@@ -5671,7 +5691,7 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
int
SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
- struct cifs_ntsd *pnntsd, int pacllen, int aclflag)
+ struct smb_ntsd *pnntsd, int pacllen, int aclflag)
{
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
current->tgid, 0, SMB2_O_INFO_SECURITY, aclflag,
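The read paths above now charge credits against the still-untransferred span rather than the whole subrequest. A worked example of that arithmetic, assuming SMB2_MAX_BUFFER_SIZE is 64 KiB as in the client headers:

	/*
	 * A 1 MiB read subrequest that has already transferred 256 KiB:
	 *
	 *   remaining    = subreq->len - subreq->transferred
	 *                = 1048576 - 262144 = 786432 bytes
	 *   CreditCharge = DIV_ROUND_UP(786432, SMB2_MAX_BUFFER_SIZE)
	 *                = DIV_ROUND_UP(786432, 65536) = 12 credits
	 *
	 * whereas charging for the full subreq->len would have cost 16.
	 */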
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index 5c458ab3b05a..076d9e83e1a0 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -364,8 +364,8 @@ struct create_posix_rsp {
u32 nlink;
u32 reparse_tag;
u32 mode;
- struct cifs_sid owner; /* var-sized on the wire */
- struct cifs_sid group; /* var-sized on the wire */
+ struct smb_sid owner; /* var-sized on the wire */
+ struct smb_sid group; /* var-sized on the wire */
} __packed;
#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2
@@ -408,8 +408,8 @@ struct smb2_posix_info {
struct smb2_posix_info_parsed {
const struct smb2_posix_info *base;
size_t size;
- struct cifs_sid owner;
- struct cifs_sid group;
+ struct smb_sid owner;
+ struct smb_sid group;
int name_len;
const u8 *name;
};
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index b208232b12a2..c7e1b149877a 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -238,7 +238,7 @@ extern int SMB2_set_info_init(struct cifs_tcon *tcon,
extern void SMB2_set_info_free(struct smb_rqst *rqst);
extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
- struct cifs_ntsd *pnntsd, int pacllen, int aclflag);
+ struct smb_ntsd *pnntsd, int pacllen, int aclflag);
extern int SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
struct smb2_file_full_ea_info *buf, int len);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 1476c445cadc..e4636fca821d 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -23,7 +23,7 @@
#include "cifsproto.h"
#include "smb2proto.h"
#include "cifs_debug.h"
-#include "smb2status.h"
+#include "../common/smb2status.h"
#include "smb2glob.h"
static int
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index d74e829de51c..0c64b37e2660 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -6,6 +6,7 @@
*/
#include <linux/module.h>
#include <linux/highmem.h>
+#include <linux/folio_queue.h>
#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
@@ -406,7 +407,7 @@ static void smbd_post_send_credits(struct work_struct *work)
else
response = get_empty_queue_buffer(info);
if (!response) {
- /* now switch to emtpy packet queue */
+ /* now switch to empty packet queue */
if (use_receive_queue) {
use_receive_queue = 0;
continue;
@@ -618,7 +619,7 @@ out:
/*
* Test if FRWR (Fast Registration Work Requests) is supported on the device
- * This implementation requries FRWR on RDMA read/write
+ * This implementation requires FRWR on RDMA read/write
* return value: true if it is supported
*/
static bool frwr_is_supported(struct ib_device_attr *attrs)
@@ -1585,10 +1586,8 @@ static struct smbd_connection *_smbd_get_connection(
conn_param.initiator_depth = 0;
conn_param.responder_resources =
- info->id->device->attrs.max_qp_rd_atom
- < SMBD_CM_RESPONDER_RESOURCES ?
- info->id->device->attrs.max_qp_rd_atom :
- SMBD_CM_RESPONDER_RESOURCES;
+ min(info->id->device->attrs.max_qp_rd_atom,
+ SMBD_CM_RESPONDER_RESOURCES);
info->responder_resources = conn_param.responder_resources;
log_rdma_mr(INFO, "responder_resources=%d\n",
info->responder_resources);
@@ -2177,7 +2176,7 @@ cleanup_entries:
* MR available in the list. It may access the list while the
* smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
* as they never modify the same places. However, there may be several CPUs
- * issueing I/O trying to get MR at the same time, mr_list_lock is used to
+ * issuing I/O trying to get MR at the same time, mr_list_lock is used to
* protect this situation.
*/
static struct smbd_mr *get_mr(struct smbd_connection *info)
@@ -2311,7 +2310,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
/*
* There is no need for waiting for completion on ib_post_send
* on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
- * on the next ib_post_send when we actaully send I/O to remote peer
+ * on the next ib_post_send when we actually send I/O to remote peer
*/
rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL);
if (!rc)
@@ -2463,6 +2462,8 @@ static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
start = 0;
}
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
return ret;
}
@@ -2519,50 +2520,65 @@ static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
start = 0;
}
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
return ret;
}
/*
- * Extract folio fragments from an XARRAY-class iterator and add them to an
- * RDMA list. The folios are not pinned.
+ * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
+ * list. The folios are not pinned.
*/
-static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter,
+static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
struct smb_extract_to_rdma *rdma,
ssize_t maxsize)
{
- struct xarray *xa = iter->xarray;
- struct folio *folio;
- loff_t start = iter->xarray_start + iter->iov_offset;
- pgoff_t index = start / PAGE_SIZE;
+ const struct folio_queue *folioq = iter->folioq;
+ unsigned int slot = iter->folioq_slot;
ssize_t ret = 0;
- size_t off, len;
- XA_STATE(xas, xa, index);
+ size_t offset = iter->iov_offset;
- rcu_read_lock();
+ BUG_ON(!folioq);
- xas_for_each(&xas, folio, ULONG_MAX) {
- if (xas_retry(&xas, folio))
- continue;
- if (WARN_ON(xa_is_value(folio)))
- break;
- if (WARN_ON(folio_test_hugetlb(folio)))
- break;
+ if (slot >= folioq_nr_slots(folioq)) {
+ folioq = folioq->next;
+ if (WARN_ON_ONCE(!folioq))
+ return -EIO;
+ slot = 0;
+ }
- off = offset_in_folio(folio, start);
- len = min_t(size_t, maxsize, folio_size(folio) - off);
+ do {
+ struct folio *folio = folioq_folio(folioq, slot);
+ size_t fsize = folioq_folio_size(folioq, slot);
- if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) {
- rcu_read_unlock();
- return -EIO;
+ if (offset < fsize) {
+ size_t part = umin(maxsize - ret, fsize - offset);
+
+ if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
+ return -EIO;
+
+ offset += part;
+ ret += part;
}
- maxsize -= len;
- ret += len;
- if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
- break;
- }
+ if (offset >= fsize) {
+ offset = 0;
+ slot++;
+ if (slot >= folioq_nr_slots(folioq)) {
+ if (!folioq->next) {
+ WARN_ON_ONCE(ret < iter->count);
+ break;
+ }
+ folioq = folioq->next;
+ slot = 0;
+ }
+ }
+ } while (rdma->nr_sge < rdma->max_sge || maxsize > 0);
- rcu_read_unlock();
+ iter->folioq = folioq;
+ iter->folioq_slot = slot;
+ iter->iov_offset = offset;
+ iter->count -= ret;
return ret;
}
@@ -2590,17 +2606,15 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
case ITER_KVEC:
ret = smb_extract_kvec_to_rdma(iter, rdma, len);
break;
- case ITER_XARRAY:
- ret = smb_extract_xarray_to_rdma(iter, rdma, len);
+ case ITER_FOLIOQ:
+ ret = smb_extract_folioq_to_rdma(iter, rdma, len);
break;
default:
WARN_ON_ONCE(1);
return -EIO;
}
- if (ret > 0) {
- iov_iter_advance(iter, ret);
- } else if (ret < 0) {
+ if (ret < 0) {
while (rdma->nr_sge > before) {
struct ib_sge *sge = &rdma->sge[rdma->nr_sge--];
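The smbdirect.c extractor above walks an ITER_FOLIOQ iterator by its (folioq, slot, offset) triple instead of an xarray index. A stripped-down sketch of that traversal pattern, ignoring the RDMA SGE bookkeeping and error paths:

	const struct folio_queue *fq = iter->folioq;
	unsigned int slot = iter->folioq_slot;
	size_t offset = iter->iov_offset, done = 0;

	while (fq && done < maxsize) {
		size_t fsize = folioq_folio_size(fq, slot);
		size_t part = umin(maxsize - done, fsize - offset);

		/* consume 'part' bytes of folioq_folio(fq, slot) starting at 'offset' */
		done += part;
		offset += part;
		if (offset >= fsize) {			/* advance to the next slot or queue segment */
			offset = 0;
			if (++slot >= folioq_nr_slots(fq)) {
				fq = fq->next;
				slot = 0;
			}
		}
	}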
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 0f0c10c7ada7..8e9964001e2a 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -30,6 +30,7 @@
EM(cifs_trace_rw_credits_old_session, "old-session") \
EM(cifs_trace_rw_credits_read_response_add, "rd-resp-add") \
EM(cifs_trace_rw_credits_read_response_clear, "rd-resp-clr") \
+ EM(cifs_trace_rw_credits_read_resubmit, "rd-resubmit") \
EM(cifs_trace_rw_credits_read_submit, "rd-submit ") \
EM(cifs_trace_rw_credits_write_prepare, "wr-prepare ") \
EM(cifs_trace_rw_credits_write_response_add, "wr-resp-add") \
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index adfe0d058701..fd5a85d43759 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -28,6 +28,7 @@
#include "cifs_debug.h"
#include "smb2proto.h"
#include "smbdirect.h"
+#include "compress.h"
/* Max number of iovectors we can use off the stack when sending requests. */
#define CIFS_MAX_IOV_SIZE 8
@@ -432,6 +433,9 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
struct kvec *iov;
int rc;
+ if (flags & CIFS_COMPRESS_REQ)
+ return smb_compress(server, &rqst[0], __smb_send_rqst);
+
if (!(flags & CIFS_TRANSFORM_REQ))
return __smb_send_rqst(server, num_rqst, rqst);
@@ -1289,7 +1293,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
out:
/*
* This will dequeue all mids. After this it is important that the
- * demultiplex_thread will not process any of these mids any futher.
+ * demultiplex_thread will not process any of these mids any further.
* This is prevented above by using a noop callback that will not
* wake this thread except for the very last PDU.
*/
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 6780aa3e98a1..58a584f0b27e 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -162,7 +162,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
case XATTR_CIFS_ACL:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
- struct cifs_ntsd *pacl;
+ struct smb_ntsd *pacl;
if (!value)
goto out;
@@ -315,7 +315,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
* fetch owner and DACL otherwise
*/
u32 acllen, extra_info;
- struct cifs_ntsd *pacl;
+ struct smb_ntsd *pacl;
if (pTcon->ses->server->ops->get_acl == NULL)
goto out; /* rc already EOPNOTSUPP */
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index c3ee42188d25..c769f9dbc0b4 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -1216,6 +1216,8 @@ struct create_context {
);
__u8 Buffer[];
} __packed;
+static_assert(offsetof(struct create_context, Buffer) == sizeof(struct create_context_hdr),
+ "struct member likely outside of __struct_group()");
struct smb2_create_req {
struct smb2_hdr hdr;
diff --git a/fs/smb/client/smb2status.h b/fs/smb/common/smb2status.h
index 9c6d79b0bd49..14b4a5f04564 100644
--- a/fs/smb/client/smb2status.h
+++ b/fs/smb/common/smb2status.h
@@ -901,6 +901,10 @@ struct ntstatus {
#define STATUS_DEVICE_ENUMERATION_ERROR cpu_to_le32(0xC0000366)
#define STATUS_MOUNT_POINT_NOT_RESOLVED cpu_to_le32(0xC0000368)
#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER cpu_to_le32(0xC0000369)
+/*
+ * 'OCCURED' is a typo in MS-ERREF; it should be 'OCCURRED',
+ * but we'll keep it consistent with MS-ERREF.
+ */
#define STATUS_MCA_OCCURED cpu_to_le32(0xC000036A)
#define STATUS_DRIVER_BLOCKED_CRITICAL cpu_to_le32(0xC000036B)
#define STATUS_DRIVER_BLOCKED cpu_to_le32(0xC000036C)
@@ -1769,3 +1773,5 @@ struct ntstatus {
#define STATUS_IPSEC_INVALID_PACKET cpu_to_le32(0xC0360005)
#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED cpu_to_le32(0xC0360006)
#define STATUS_IPSEC_CLEAR_TEXT_DROP cpu_to_le32(0xC0360007)
+#define STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP cpu_to_le32(0xC05D0000)
+#define STATUS_INVALID_LOCK_RANGE cpu_to_le32(0xC00001a1)
diff --git a/fs/smb/common/smbacl.h b/fs/smb/common/smbacl.h
new file mode 100644
index 000000000000..6a60698fc6f0
--- /dev/null
+++ b/fs/smb/common/smbacl.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+/*
+ * Copyright (c) International Business Machines Corp., 2007
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Modified by Namjae Jeon (linkinjeon@kernel.org)
+ */
+
+#ifndef _COMMON_SMBACL_H
+#define _COMMON_SMBACL_H
+
+#define NUM_AUTHS (6) /* number of authority fields */
+#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
+
+/* ACE types - see MS-DTYP 2.4.4.1 */
+#define ACCESS_ALLOWED_ACE_TYPE 0x00
+#define ACCESS_DENIED_ACE_TYPE 0x01
+#define SYSTEM_AUDIT_ACE_TYPE 0x02
+#define SYSTEM_ALARM_ACE_TYPE 0x03
+#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
+#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
+#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
+#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
+#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
+#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
+#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
+#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */
+#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */
+#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11
+#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12
+#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13
+
+/* ACE flags */
+#define OBJECT_INHERIT_ACE 0x01
+#define CONTAINER_INHERIT_ACE 0x02
+#define NO_PROPAGATE_INHERIT_ACE 0x04
+#define INHERIT_ONLY_ACE 0x08
+#define INHERITED_ACE 0x10
+#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40
+#define FAILED_ACCESS_ACE_FLAG 0x80
+
+/*
+ * Maximum size of a string representation of a SID:
+ *
+ * The fields are unsigned values in decimal. So:
+ *
+ * u8: max 3 bytes in decimal
+ * u32: max 10 bytes in decimal
+ *
+ * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
+ *
+ * For authority field, max is when all 6 values are non-zero and it must be
+ * represented in hex. So "-0x" + 12 hex digits.
+ *
+ * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
+ */
+#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
+#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
+
+#define DOMAIN_USER_RID_LE cpu_to_le32(513)
+
+/*
+ * ACE types - see MS-DTYP 2.4.4.1
+ */
+enum {
+ ACCESS_ALLOWED,
+ ACCESS_DENIED,
+};
+
+/*
+ * Security ID types
+ */
+enum {
+ SIDOWNER = 1,
+ SIDGROUP,
+ SIDCREATOR_OWNER,
+ SIDCREATOR_GROUP,
+ SIDUNIX_USER,
+ SIDUNIX_GROUP,
+ SIDNFS_USER,
+ SIDNFS_GROUP,
+ SIDNFS_MODE,
+};
+
+struct smb_ntsd {
+ __le16 revision; /* revision level */
+ __le16 type;
+ __le32 osidoffset;
+ __le32 gsidoffset;
+ __le32 sacloffset;
+ __le32 dacloffset;
+} __attribute__((packed));
+
+struct smb_sid {
+ __u8 revision; /* revision level */
+ __u8 num_subauth;
+ __u8 authority[NUM_AUTHS];
+ __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
+} __attribute__((packed));
+
+/* size of a struct smb_sid, sans sub_auth array */
+#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
+
+struct smb_acl {
+ __le16 revision; /* revision level */
+ __le16 size;
+ __le32 num_aces;
+} __attribute__((packed));
+
+struct smb_ace {
+ __u8 type; /* see above and MS-DTYP 2.4.4.1 */
+ __u8 flags;
+ __le16 size;
+ __le32 access_req;
+ struct smb_sid sid; /* ie UUID of user or group who gets these perms */
+} __attribute__((packed));
+
+#endif /* _COMMON_SMBACL_H */
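The size comment in the new header can be checked with a short worked example; the combined macro name below is hypothetical and shown only to make the arithmetic concrete:

	/*
	 * Worked maximum for a SID string built from the constants above:
	 *
	 *   SID_STRING_BASE_SIZE           = 2 + 3 + 15 + 1 = 21
	 *   SID_MAX_SUB_AUTHORITIES * 11   = 15 * 11        = 165
	 *   total buffer needed            = 21 + 165       = 186 bytes
	 */
	#define SID_STRING_MAX (SID_STRING_BASE_SIZE + \
				SID_MAX_SUB_AUTHORITIES * SID_STRING_SUBAUTH_SIZE)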
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 09e1e7771592..cac80e7bfefc 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -39,7 +39,8 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
xa_destroy(&conn->sessions);
kvfree(conn->request_buf);
kfree(conn->preauth_info);
- kfree(conn);
+ if (atomic_dec_and_test(&conn->refcnt))
+ kfree(conn);
}
/**
@@ -68,6 +69,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
conn->um = NULL;
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
+ atomic_set(&conn->refcnt, 1);
conn->total_credits = 1;
conn->outstanding_credits = 0;
@@ -165,11 +167,43 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status)
up_read(&conn_list_lock);
}
-void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id)
+void ksmbd_conn_wait_idle(struct ksmbd_conn *conn)
{
wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2);
}
+int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id)
+{
+ struct ksmbd_conn *conn;
+ int rc, retry_count = 0, max_timeout = 120;
+ int rcount = 1;
+
+retry_idle:
+ if (retry_count >= max_timeout)
+ return -EIO;
+
+ down_read(&conn_list_lock);
+ list_for_each_entry(conn, &conn_list, conns_list) {
+ if (conn->binding || xa_load(&conn->sessions, sess_id)) {
+ if (conn == curr_conn)
+ rcount = 2;
+ if (atomic_read(&conn->req_running) >= rcount) {
+ rc = wait_event_timeout(conn->req_running_q,
+ atomic_read(&conn->req_running) < rcount,
+ HZ);
+ if (!rc) {
+ up_read(&conn_list_lock);
+ retry_count++;
+ goto retry_idle;
+ }
+ }
+ }
+ }
+ up_read(&conn_list_lock);
+
+ return 0;
+}
+
int ksmbd_conn_write(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 5c2845e47cf2..b379ae4fdcdf 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -106,6 +106,7 @@ struct ksmbd_conn {
bool signing_negotiated;
__le16 signing_algorithm;
bool binding;
+ atomic_t refcnt;
};
struct ksmbd_conn_ops {
@@ -145,7 +146,8 @@ extern struct list_head conn_list;
extern struct rw_semaphore conn_list_lock;
bool ksmbd_conn_alive(struct ksmbd_conn *conn);
-void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id);
+void ksmbd_conn_wait_idle(struct ksmbd_conn *conn);
+int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id);
struct ksmbd_conn *ksmbd_conn_alloc(void);
void ksmbd_conn_free(struct ksmbd_conn *conn);
bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c
index e0a6b758094f..d8d03070ae44 100644
--- a/fs/smb/server/mgmt/share_config.c
+++ b/fs/smb/server/mgmt/share_config.c
@@ -15,6 +15,7 @@
#include "share_config.h"
#include "user_config.h"
#include "user_session.h"
+#include "../connection.h"
#include "../transport_ipc.h"
#include "../misc.h"
@@ -120,12 +121,13 @@ static int parse_veto_list(struct ksmbd_share_config *share,
return 0;
}
-static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
+static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work,
const char *name)
{
struct ksmbd_share_config_response *resp;
struct ksmbd_share_config *share = NULL;
struct ksmbd_share_config *lookup;
+ struct unicode_map *um = work->conn->um;
int ret;
resp = ksmbd_ipc_share_config_request(name);
@@ -181,7 +183,14 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
KSMBD_SHARE_CONFIG_VETO_LIST(resp),
resp->veto_list_sz);
if (!ret && share->path) {
+ if (__ksmbd_override_fsids(work, share)) {
+ kill_share(share);
+ share = NULL;
+ goto out;
+ }
+
ret = kern_path(share->path, 0, &share->vfs_path);
+ ksmbd_revert_fsids(work);
if (ret) {
ksmbd_debug(SMB, "failed to access '%s'\n",
share->path);
@@ -214,7 +223,7 @@ out:
return share;
}
-struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work,
const char *name)
{
struct ksmbd_share_config *share;
@@ -227,7 +236,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
if (share)
return share;
- return share_config_request(um, name);
+ return share_config_request(work, name);
}
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h
index 5f591751b923..d4ac2dd4de20 100644
--- a/fs/smb/server/mgmt/share_config.h
+++ b/fs/smb/server/mgmt/share_config.h
@@ -11,6 +11,8 @@
#include <linux/path.h>
#include <linux/unicode.h>
+struct ksmbd_work;
+
struct ksmbd_share_config {
char *name;
char *path;
@@ -68,7 +70,7 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share)
__ksmbd_share_config_put(share);
}
-struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um,
+struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work,
const char *name);
bool ksmbd_share_veto_filename(struct ksmbd_share_config *share,
const char *filename);
diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c
index d2c81a8a11dd..94a52a75014a 100644
--- a/fs/smb/server/mgmt/tree_connect.c
+++ b/fs/smb/server/mgmt/tree_connect.c
@@ -16,17 +16,18 @@
#include "user_session.h"
struct ksmbd_tree_conn_status
-ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
- const char *share_name)
+ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name)
{
struct ksmbd_tree_conn_status status = {-ENOENT, NULL};
struct ksmbd_tree_connect_response *resp = NULL;
struct ksmbd_share_config *sc;
struct ksmbd_tree_connect *tree_conn = NULL;
struct sockaddr *peer_addr;
+ struct ksmbd_conn *conn = work->conn;
+ struct ksmbd_session *sess = work->sess;
int ret;
- sc = ksmbd_share_config_get(conn->um, share_name);
+ sc = ksmbd_share_config_get(work, share_name);
if (!sc)
return status;
@@ -61,7 +62,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
struct ksmbd_share_config *new_sc;
ksmbd_share_config_del(sc);
- new_sc = ksmbd_share_config_get(conn->um, share_name);
+ new_sc = ksmbd_share_config_get(work, share_name);
if (!new_sc) {
pr_err("Failed to update stale share config\n");
status.ret = -ESTALE;
diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h
index 6377a70b811c..a42cdd051041 100644
--- a/fs/smb/server/mgmt/tree_connect.h
+++ b/fs/smb/server/mgmt/tree_connect.h
@@ -13,6 +13,7 @@
struct ksmbd_share_config;
struct ksmbd_user;
struct ksmbd_conn;
+struct ksmbd_work;
enum {
TREE_NEW = 0,
@@ -50,8 +51,7 @@ static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn,
struct ksmbd_session;
struct ksmbd_tree_conn_status
-ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
- const char *share_name);
+ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name);
void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon);
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 162a12685d2c..99416ce9f501 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -311,6 +311,7 @@ void destroy_previous_session(struct ksmbd_conn *conn,
{
struct ksmbd_session *prev_sess;
struct ksmbd_user *prev_user;
+ int err;
down_write(&sessions_table_lock);
down_write(&conn->session_lock);
@@ -325,8 +326,16 @@ void destroy_previous_session(struct ksmbd_conn *conn,
memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
goto out;
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT);
+ err = ksmbd_conn_wait_idle_sess_id(conn, id);
+ if (err) {
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
+ goto out;
+ }
+
ksmbd_destroy_file_table(&prev_sess->file_table);
prev_sess->state = SMB2_SESSION_EXPIRED;
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
ksmbd_launch_ksmbd_durable_scavenger();
out:
up_write(&conn->session_lock);
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index a8f52c4ebbda..246cde380dfb 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -10,7 +10,7 @@
#include "oplock.h"
#include "smb_common.h"
-#include "smbstatus.h"
+#include "../common/smb2status.h"
#include "connection.h"
#include "mgmt/user_session.h"
#include "mgmt/share_config.h"
@@ -51,6 +51,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
init_waitqueue_head(&opinfo->oplock_brk);
atomic_set(&opinfo->refcount, 1);
atomic_set(&opinfo->breaking_cnt, 0);
+ atomic_inc(&opinfo->conn->refcnt);
return opinfo;
}
@@ -124,6 +125,8 @@ static void free_opinfo(struct oplock_info *opinfo)
{
if (opinfo->is_lease)
free_lease(opinfo);
+ if (opinfo->conn && atomic_dec_and_test(&opinfo->conn->refcnt))
+ kfree(opinfo->conn);
kfree(opinfo);
}
@@ -163,9 +166,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
!atomic_inc_not_zero(&opinfo->refcount))
opinfo = NULL;
else {
- atomic_inc(&opinfo->conn->r_count);
if (ksmbd_conn_releasing(opinfo->conn)) {
- atomic_dec(&opinfo->conn->r_count);
atomic_dec(&opinfo->refcount);
opinfo = NULL;
}
@@ -177,26 +178,11 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
return opinfo;
}
-static void opinfo_conn_put(struct oplock_info *opinfo)
+void opinfo_put(struct oplock_info *opinfo)
{
- struct ksmbd_conn *conn;
-
if (!opinfo)
return;
- conn = opinfo->conn;
- /*
- * Checking waitqueue to dropping pending requests on
- * disconnection. waitqueue_active is safe because it
- * uses atomic operation for condition.
- */
- if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
- wake_up(&conn->r_count_q);
- opinfo_put(opinfo);
-}
-
-void opinfo_put(struct oplock_info *opinfo)
-{
if (!atomic_dec_and_test(&opinfo->refcount))
return;
@@ -1127,14 +1113,11 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- atomic_inc(&opinfo->conn->r_count);
- if (ksmbd_conn_releasing(opinfo->conn)) {
- atomic_dec(&opinfo->conn->r_count);
+ if (ksmbd_conn_releasing(opinfo->conn))
continue;
- }
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
- opinfo_conn_put(opinfo);
+ opinfo_put(opinfo);
}
}
up_read(&p_ci->m_lock);
@@ -1167,13 +1150,10 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- atomic_inc(&opinfo->conn->r_count);
- if (ksmbd_conn_releasing(opinfo->conn)) {
- atomic_dec(&opinfo->conn->r_count);
+ if (ksmbd_conn_releasing(opinfo->conn))
continue;
- }
oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
- opinfo_conn_put(opinfo);
+ opinfo_put(opinfo);
}
}
up_read(&p_ci->m_lock);
@@ -1252,7 +1232,7 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
prev_opinfo = opinfo_get_list(ci);
if (!prev_opinfo ||
(prev_opinfo->level == SMB2_OPLOCK_LEVEL_NONE && lctx)) {
- opinfo_conn_put(prev_opinfo);
+ opinfo_put(prev_opinfo);
goto set_lev;
}
prev_op_has_lease = prev_opinfo->is_lease;
@@ -1262,19 +1242,19 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
if (share_ret < 0 &&
prev_opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
err = share_ret;
- opinfo_conn_put(prev_opinfo);
+ opinfo_put(prev_opinfo);
goto err_out;
}
if (prev_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH &&
prev_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
- opinfo_conn_put(prev_opinfo);
+ opinfo_put(prev_opinfo);
goto op_break_not_needed;
}
list_add(&work->interim_entry, &prev_opinfo->interim_list);
err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II);
- opinfo_conn_put(prev_opinfo);
+ opinfo_put(prev_opinfo);
if (err == -ENOENT)
goto set_lev;
/* Check all oplock was freed by close */
@@ -1337,14 +1317,14 @@ static void smb_break_all_write_oplock(struct ksmbd_work *work,
return;
if (brk_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH &&
brk_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
- opinfo_conn_put(brk_opinfo);
+ opinfo_put(brk_opinfo);
return;
}
brk_opinfo->open_trunc = is_trunc;
list_add(&work->interim_entry, &brk_opinfo->interim_list);
oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II);
- opinfo_conn_put(brk_opinfo);
+ opinfo_put(brk_opinfo);
}
/**
@@ -1376,11 +1356,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
if (!atomic_inc_not_zero(&brk_op->refcount))
continue;
- atomic_inc(&brk_op->conn->r_count);
- if (ksmbd_conn_releasing(brk_op->conn)) {
- atomic_dec(&brk_op->conn->r_count);
+ if (ksmbd_conn_releasing(brk_op->conn))
continue;
- }
rcu_read_unlock();
if (brk_op->is_lease && (brk_op->o_lease->state &
@@ -1411,7 +1388,7 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
brk_op->open_trunc = is_trunc;
oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE);
next:
- opinfo_conn_put(brk_op);
+ opinfo_put(brk_op);
rcu_read_lock();
}
rcu_read_unlock();
@@ -1510,7 +1487,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
* parse_lease_state() - parse lease context containted in file open request
* @open_req: buffer containing smb2 file open(create) request
*
- * Return: oplock state, -ENOENT if create lease context not found
+ * Return: allocated lease context object on success, otherwise NULL
*/
struct lease_ctx_info *parse_lease_state(void *open_req)
{
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 4d24cc105ef6..c402d4abe826 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -15,7 +15,7 @@
#include "server.h"
#include "smb_common.h"
-#include "smbstatus.h"
+#include "../common/smb2status.h"
#include "connection.h"
#include "transport_ipc.h"
#include "mgmt/user_session.h"
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 727cb49926ee..ae501024665e 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -7,7 +7,7 @@
#include "glob.h"
#include "nterr.h"
#include "smb_common.h"
-#include "smbstatus.h"
+#include "../common/smb2status.h"
#include "mgmt/user_session.h"
#include "connection.h"
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 37a39ab4ee65..e6bdc1b20727 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -30,7 +30,7 @@
#include "server.h"
#include "smb_common.h"
-#include "smbstatus.h"
+#include "../common/smb2status.h"
#include "ksmbd_work.h"
#include "mgmt/user_config.h"
#include "mgmt/share_config.h"
@@ -519,7 +519,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work)
* smb2_allocate_rsp_buf() - allocate smb2 response buffer
* @work: smb work containing smb request buffer
*
- * Return: 0 on success, otherwise -ENOMEM
+ * Return: 0 on success, otherwise error
*/
int smb2_allocate_rsp_buf(struct ksmbd_work *work)
{
@@ -1370,7 +1370,8 @@ static int ntlm_negotiate(struct ksmbd_work *work,
}
sz = le16_to_cpu(rsp->SecurityBufferOffset);
- memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
+ unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len,
+ /* alloc is larger than blob, see smb2_allocate_rsp_buf() */);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
out:
@@ -1453,7 +1454,9 @@ static int ntlm_authenticate(struct ksmbd_work *work,
return -ENOMEM;
sz = le16_to_cpu(rsp->SecurityBufferOffset);
- memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
+ unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob,
+ spnego_blob_len,
+ /* alloc is larger than blob, see smb2_allocate_rsp_buf() */);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
kfree(spnego_blob);
}
@@ -1687,6 +1690,8 @@ int smb2_sess_setup(struct ksmbd_work *work)
rc = ksmbd_session_register(conn, sess);
if (rc)
goto out_err;
+
+ conn->binding = false;
} else if (conn->dialect >= SMB30_PROT_ID &&
(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) &&
req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) {
@@ -1765,6 +1770,8 @@ int smb2_sess_setup(struct ksmbd_work *work)
sess = NULL;
goto out_err;
}
+
+ conn->binding = false;
}
work->sess = sess;
@@ -1955,7 +1962,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n",
name, treename);
- status = ksmbd_tree_conn_connect(conn, sess, name);
+ status = ksmbd_tree_conn_connect(work, name);
if (status.ret == KSMBD_TREE_CONN_STATUS_OK)
rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id);
else
@@ -2210,7 +2217,7 @@ int smb2_session_logoff(struct ksmbd_work *work)
ksmbd_conn_unlock(conn);
ksmbd_close_session_fds(work);
- ksmbd_conn_wait_idle(conn, sess_id);
+ ksmbd_conn_wait_idle(conn);
/*
* Re-lookup session to validate if session is deleted
@@ -2767,8 +2774,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work,
}
}
- if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
- req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+ if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+ req_op_level == SMB2_OPLOCK_LEVEL_BATCH) {
dh_info->CreateGuid =
durable_v2_blob->CreateGuid;
dh_info->persistent =
@@ -2788,8 +2795,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work,
goto out;
}
- if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
- req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+ if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+ req_op_level == SMB2_OPLOCK_LEVEL_BATCH) {
ksmbd_debug(SMB, "Request for durable open\n");
dh_info->type = dh_idx;
}
@@ -3093,7 +3100,6 @@ int smb2_open(struct ksmbd_work *work)
goto err_out;
}
- file_present = true;
idmap = mnt_idmap(path.mnt);
} else {
if (rc != -ENOENT)
@@ -3411,7 +3417,7 @@ int smb2_open(struct ksmbd_work *work)
goto err_out1;
}
} else {
- if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+ if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && lc) {
if (S_ISDIR(file_inode(filp)->i_mode)) {
lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE;
lc->is_dir = true;
@@ -3710,7 +3716,7 @@ err_out2:
kfree(name);
kfree(lc);
- return 0;
+ return rc;
}
static int readdir_info_level_struct_sz(int info_level)
@@ -4406,7 +4412,8 @@ int smb2_query_dir(struct ksmbd_work *work)
rsp->OutputBufferLength = cpu_to_le32(0);
rsp->Buffer[0] = 0;
rc = ksmbd_iov_pin_rsp(work, (void *)rsp,
- sizeof(struct smb2_query_directory_rsp));
+ offsetof(struct smb2_query_directory_rsp, Buffer)
+ + 1);
if (rc)
goto err_out;
} else {
@@ -5357,7 +5364,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
"NTFS", PATH_MAX, conn->local_nls, 0);
len = len * 2;
info->FileSystemNameLen = cpu_to_le32(len);
- sz = sizeof(struct filesystem_attribute_info) - 2 + len;
+ sz = sizeof(struct filesystem_attribute_info) + len;
rsp->OutputBufferLength = cpu_to_le32(sz);
break;
}
@@ -5383,7 +5390,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
len = len * 2;
info->VolumeLabelSize = cpu_to_le32(len);
info->Reserved = 0;
- sz = sizeof(struct filesystem_vol_info) - 2 + len;
+ sz = sizeof(struct filesystem_vol_info) + len;
rsp->OutputBufferLength = cpu_to_le32(sz);
break;
}
@@ -5596,6 +5603,11 @@ int smb2_query_info(struct ksmbd_work *work)
ksmbd_debug(SMB, "GOT query info request\n");
+ if (ksmbd_override_fsids(work)) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
switch (req->InfoType) {
case SMB2_O_INFO_FILE:
ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n");
@@ -5614,6 +5626,7 @@ int smb2_query_info(struct ksmbd_work *work)
req->InfoType);
rc = -EOPNOTSUPP;
}
+ ksmbd_revert_fsids(work);
if (!rc) {
rsp->StructureSize = cpu_to_le16(9);
@@ -5623,6 +5636,7 @@ int smb2_query_info(struct ksmbd_work *work)
le32_to_cpu(rsp->OutputBufferLength));
}
+err_out:
if (rc < 0) {
if (rc == -EACCES)
rsp->hdr.Status = STATUS_ACCESS_DENIED;
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 474dadf6b7b8..cc4bb2377cbd 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -9,7 +9,7 @@
#include "smb_common.h"
#include "server.h"
#include "misc.h"
-#include "smbstatus.h"
+#include "../common/smb2status.h"
#include "connection.h"
#include "ksmbd_work.h"
#include "mgmt/user_session.h"
@@ -732,10 +732,10 @@ bool is_asterisk(char *p)
return p && p[0] == '*';
}
-int ksmbd_override_fsids(struct ksmbd_work *work)
+int __ksmbd_override_fsids(struct ksmbd_work *work,
+ struct ksmbd_share_config *share)
{
struct ksmbd_session *sess = work->sess;
- struct ksmbd_share_config *share = work->tcon->share_conf;
struct cred *cred;
struct group_info *gi;
unsigned int uid;
@@ -775,6 +775,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
return 0;
}
+int ksmbd_override_fsids(struct ksmbd_work *work)
+{
+ return __ksmbd_override_fsids(work, work->tcon->share_conf);
+}
+
void ksmbd_revert_fsids(struct ksmbd_work *work)
{
const struct cred *cred;
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index f1092519c0c2..cc1d6dfe29d5 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -213,7 +213,7 @@ struct filesystem_attribute_info {
__le32 Attributes;
__le32 MaxPathNameComponentLength;
__le32 FileSystemNameLen;
- __le16 FileSystemName[1]; /* do not have to save this - get subset? */
+ __le16 FileSystemName[]; /* do not have to save this - get subset? */
} __packed;
struct filesystem_device_info {
@@ -226,7 +226,7 @@ struct filesystem_vol_info {
__le32 SerialNumber;
__le32 VolumeLabelSize;
__le16 Reserved;
- __le16 VolumeLabel[1];
+ __le16 VolumeLabel[];
} __packed;
struct filesystem_info {
@@ -447,6 +447,8 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn,
int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command);
int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp);
+int __ksmbd_override_fsids(struct ksmbd_work *work,
+ struct ksmbd_share_config *share);
int ksmbd_override_fsids(struct ksmbd_work *work);
void ksmbd_revert_fsids(struct ksmbd_work *work);
diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h
index 2b52861707d8..24ce576fc292 100644
--- a/fs/smb/server/smbacl.h
+++ b/fs/smb/server/smbacl.h
@@ -8,6 +8,7 @@
#ifndef _SMBACL_H
#define _SMBACL_H
+#include "../common/smbacl.h"
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
@@ -15,32 +16,6 @@
#include "mgmt/tree_connect.h"
-#define NUM_AUTHS (6) /* number of authority fields */
-#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
-
-/*
- * ACE types - see MS-DTYP 2.4.4.1
- */
-enum {
- ACCESS_ALLOWED,
- ACCESS_DENIED,
-};
-
-/*
- * Security ID types
- */
-enum {
- SIDOWNER = 1,
- SIDGROUP,
- SIDCREATOR_OWNER,
- SIDCREATOR_GROUP,
- SIDUNIX_USER,
- SIDUNIX_GROUP,
- SIDNFS_USER,
- SIDNFS_GROUP,
- SIDNFS_MODE,
-};
-
/* Revision for ACLs */
#define SD_REVISION 1
@@ -62,92 +37,8 @@ enum {
#define RM_CONTROL_VALID 0x4000
#define SELF_RELATIVE 0x8000
-/* ACE types - see MS-DTYP 2.4.4.1 */
-#define ACCESS_ALLOWED_ACE_TYPE 0x00
-#define ACCESS_DENIED_ACE_TYPE 0x01
-#define SYSTEM_AUDIT_ACE_TYPE 0x02
-#define SYSTEM_ALARM_ACE_TYPE 0x03
-#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
-#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
-#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
-#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
-#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
-#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
-#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
-#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
-#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
-#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
-#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */
-#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
-#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */
-#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11
-#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12
-#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13
-
-/* ACE flags */
-#define OBJECT_INHERIT_ACE 0x01
-#define CONTAINER_INHERIT_ACE 0x02
-#define NO_PROPAGATE_INHERIT_ACE 0x04
-#define INHERIT_ONLY_ACE 0x08
-#define INHERITED_ACE 0x10
-#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40
-#define FAILED_ACCESS_ACE_FLAG 0x80
-
-/*
- * Maximum size of a string representation of a SID:
- *
- * The fields are unsigned values in decimal. So:
- *
- * u8: max 3 bytes in decimal
- * u32: max 10 bytes in decimal
- *
- * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
- *
- * For authority field, max is when all 6 values are non-zero and it must be
- * represented in hex. So "-0x" + 12 hex digits.
- *
- * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
- */
-#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
-#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
-
-#define DOMAIN_USER_RID_LE cpu_to_le32(513)
-
struct ksmbd_conn;
-struct smb_ntsd {
- __le16 revision; /* revision level */
- __le16 type;
- __le32 osidoffset;
- __le32 gsidoffset;
- __le32 sacloffset;
- __le32 dacloffset;
-} __packed;
-
-struct smb_sid {
- __u8 revision; /* revision level */
- __u8 num_subauth;
- __u8 authority[NUM_AUTHS];
- __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
-} __packed;
-
-/* size of a struct cifs_sid, sans sub_auth array */
-#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
-
-struct smb_acl {
- __le16 revision; /* revision level */
- __le16 size;
- __le32 num_aces;
-} __packed;
-
-struct smb_ace {
- __u8 type;
- __u8 flags;
- __le16 size;
- __le32 access_req;
- struct smb_sid sid; /* ie UUID of user or group who gets these perms */
-} __packed;
-
struct smb_fattr {
kuid_t cf_uid;
kgid_t cf_gid;
diff --git a/fs/smb/server/smbstatus.h b/fs/smb/server/smbstatus.h
deleted file mode 100644
index 8963deb42404..000000000000
--- a/fs/smb/server/smbstatus.h
+++ /dev/null
@@ -1,1822 +0,0 @@
-/* SPDX-License-Identifier: LGPL-2.1+ */
-/*
- * fs/server/smb2status.h
- *
- * SMB2 Status code (network error) definitions
- * Definitions are from MS-ERREF
- *
- * Copyright (c) International Business Machines Corp., 2009,2011
- * Author(s): Steve French (sfrench@us.ibm.com)
- */
-
-/*
- * 0 1 2 3 4 5 6 7 8 9 0 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
- * SEV C N <-------Facility--------> <------Error Status Code------>
- *
- * C is set if "customer defined" error, N bit is reserved and MBZ
- */
-
-#define STATUS_SEVERITY_SUCCESS cpu_to_le32(0x0000)
-#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001)
-#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002)
-#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003)
-
-struct ntstatus {
- /* Facility is the high 12 bits of the following field */
- __le32 Facility; /* low 2 bits Severity, next is Customer, then rsrvd */
- __le32 Code;
-};
-
-#define STATUS_SUCCESS 0x00000000
-#define STATUS_WAIT_0 cpu_to_le32(0x00000000)
-#define STATUS_WAIT_1 cpu_to_le32(0x00000001)
-#define STATUS_WAIT_2 cpu_to_le32(0x00000002)
-#define STATUS_WAIT_3 cpu_to_le32(0x00000003)
-#define STATUS_WAIT_63 cpu_to_le32(0x0000003F)
-#define STATUS_ABANDONED cpu_to_le32(0x00000080)
-#define STATUS_ABANDONED_WAIT_0 cpu_to_le32(0x00000080)
-#define STATUS_ABANDONED_WAIT_63 cpu_to_le32(0x000000BF)
-#define STATUS_USER_APC cpu_to_le32(0x000000C0)
-#define STATUS_KERNEL_APC cpu_to_le32(0x00000100)
-#define STATUS_ALERTED cpu_to_le32(0x00000101)
-#define STATUS_TIMEOUT cpu_to_le32(0x00000102)
-#define STATUS_PENDING cpu_to_le32(0x00000103)
-#define STATUS_REPARSE cpu_to_le32(0x00000104)
-#define STATUS_MORE_ENTRIES cpu_to_le32(0x00000105)
-#define STATUS_NOT_ALL_ASSIGNED cpu_to_le32(0x00000106)
-#define STATUS_SOME_NOT_MAPPED cpu_to_le32(0x00000107)
-#define STATUS_OPLOCK_BREAK_IN_PROGRESS cpu_to_le32(0x00000108)
-#define STATUS_VOLUME_MOUNTED cpu_to_le32(0x00000109)
-#define STATUS_RXACT_COMMITTED cpu_to_le32(0x0000010A)
-#define STATUS_NOTIFY_CLEANUP cpu_to_le32(0x0000010B)
-#define STATUS_NOTIFY_ENUM_DIR cpu_to_le32(0x0000010C)
-#define STATUS_NO_QUOTAS_FOR_ACCOUNT cpu_to_le32(0x0000010D)
-#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED cpu_to_le32(0x0000010E)
-#define STATUS_PAGE_FAULT_TRANSITION cpu_to_le32(0x00000110)
-#define STATUS_PAGE_FAULT_DEMAND_ZERO cpu_to_le32(0x00000111)
-#define STATUS_PAGE_FAULT_COPY_ON_WRITE cpu_to_le32(0x00000112)
-#define STATUS_PAGE_FAULT_GUARD_PAGE cpu_to_le32(0x00000113)
-#define STATUS_PAGE_FAULT_PAGING_FILE cpu_to_le32(0x00000114)
-#define STATUS_CACHE_PAGE_LOCKED cpu_to_le32(0x00000115)
-#define STATUS_CRASH_DUMP cpu_to_le32(0x00000116)
-#define STATUS_BUFFER_ALL_ZEROS cpu_to_le32(0x00000117)
-#define STATUS_REPARSE_OBJECT cpu_to_le32(0x00000118)
-#define STATUS_RESOURCE_REQUIREMENTS_CHANGED cpu_to_le32(0x00000119)
-#define STATUS_TRANSLATION_COMPLETE cpu_to_le32(0x00000120)
-#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY cpu_to_le32(0x00000121)
-#define STATUS_NOTHING_TO_TERMINATE cpu_to_le32(0x00000122)
-#define STATUS_PROCESS_NOT_IN_JOB cpu_to_le32(0x00000123)
-#define STATUS_PROCESS_IN_JOB cpu_to_le32(0x00000124)
-#define STATUS_VOLSNAP_HIBERNATE_READY cpu_to_le32(0x00000125)
-#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY cpu_to_le32(0x00000126)
-#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED cpu_to_le32(0x00000127)
-#define STATUS_INTERRUPT_STILL_CONNECTED cpu_to_le32(0x00000128)
-#define STATUS_PROCESS_CLONED cpu_to_le32(0x00000129)
-#define STATUS_FILE_LOCKED_WITH_ONLY_READERS cpu_to_le32(0x0000012A)
-#define STATUS_FILE_LOCKED_WITH_WRITERS cpu_to_le32(0x0000012B)
-#define STATUS_RESOURCEMANAGER_READ_ONLY cpu_to_le32(0x00000202)
-#define STATUS_WAIT_FOR_OPLOCK cpu_to_le32(0x00000367)
-#define DBG_EXCEPTION_HANDLED cpu_to_le32(0x00010001)
-#define DBG_CONTINUE cpu_to_le32(0x00010002)
-#define STATUS_FLT_IO_COMPLETE cpu_to_le32(0x001C0001)
-#define STATUS_OBJECT_NAME_EXISTS cpu_to_le32(0x40000000)
-#define STATUS_THREAD_WAS_SUSPENDED cpu_to_le32(0x40000001)
-#define STATUS_WORKING_SET_LIMIT_RANGE cpu_to_le32(0x40000002)
-#define STATUS_IMAGE_NOT_AT_BASE cpu_to_le32(0x40000003)
-#define STATUS_RXACT_STATE_CREATED cpu_to_le32(0x40000004)
-#define STATUS_SEGMENT_NOTIFICATION cpu_to_le32(0x40000005)
-#define STATUS_LOCAL_USER_SESSION_KEY cpu_to_le32(0x40000006)
-#define STATUS_BAD_CURRENT_DIRECTORY cpu_to_le32(0x40000007)
-#define STATUS_SERIAL_MORE_WRITES cpu_to_le32(0x40000008)
-#define STATUS_REGISTRY_RECOVERED cpu_to_le32(0x40000009)
-#define STATUS_FT_READ_RECOVERY_FROM_BACKUP cpu_to_le32(0x4000000A)
-#define STATUS_FT_WRITE_RECOVERY cpu_to_le32(0x4000000B)
-#define STATUS_SERIAL_COUNTER_TIMEOUT cpu_to_le32(0x4000000C)
-#define STATUS_NULL_LM_PASSWORD cpu_to_le32(0x4000000D)
-#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH cpu_to_le32(0x4000000E)
-#define STATUS_RECEIVE_PARTIAL cpu_to_le32(0x4000000F)
-#define STATUS_RECEIVE_EXPEDITED cpu_to_le32(0x40000010)
-#define STATUS_RECEIVE_PARTIAL_EXPEDITED cpu_to_le32(0x40000011)
-#define STATUS_EVENT_DONE cpu_to_le32(0x40000012)
-#define STATUS_EVENT_PENDING cpu_to_le32(0x40000013)
-#define STATUS_CHECKING_FILE_SYSTEM cpu_to_le32(0x40000014)
-#define STATUS_FATAL_APP_EXIT cpu_to_le32(0x40000015)
-#define STATUS_PREDEFINED_HANDLE cpu_to_le32(0x40000016)
-#define STATUS_WAS_UNLOCKED cpu_to_le32(0x40000017)
-#define STATUS_SERVICE_NOTIFICATION cpu_to_le32(0x40000018)
-#define STATUS_WAS_LOCKED cpu_to_le32(0x40000019)
-#define STATUS_LOG_HARD_ERROR cpu_to_le32(0x4000001A)
-#define STATUS_ALREADY_WIN32 cpu_to_le32(0x4000001B)
-#define STATUS_WX86_UNSIMULATE cpu_to_le32(0x4000001C)
-#define STATUS_WX86_CONTINUE cpu_to_le32(0x4000001D)
-#define STATUS_WX86_SINGLE_STEP cpu_to_le32(0x4000001E)
-#define STATUS_WX86_BREAKPOINT cpu_to_le32(0x4000001F)
-#define STATUS_WX86_EXCEPTION_CONTINUE cpu_to_le32(0x40000020)
-#define STATUS_WX86_EXCEPTION_LASTCHANCE cpu_to_le32(0x40000021)
-#define STATUS_WX86_EXCEPTION_CHAIN cpu_to_le32(0x40000022)
-#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE cpu_to_le32(0x40000023)
-#define STATUS_NO_YIELD_PERFORMED cpu_to_le32(0x40000024)
-#define STATUS_TIMER_RESUME_IGNORED cpu_to_le32(0x40000025)
-#define STATUS_ARBITRATION_UNHANDLED cpu_to_le32(0x40000026)
-#define STATUS_CARDBUS_NOT_SUPPORTED cpu_to_le32(0x40000027)
-#define STATUS_WX86_CREATEWX86TIB cpu_to_le32(0x40000028)
-#define STATUS_MP_PROCESSOR_MISMATCH cpu_to_le32(0x40000029)
-#define STATUS_HIBERNATED cpu_to_le32(0x4000002A)
-#define STATUS_RESUME_HIBERNATION cpu_to_le32(0x4000002B)
-#define STATUS_FIRMWARE_UPDATED cpu_to_le32(0x4000002C)
-#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES cpu_to_le32(0x4000002D)
-#define STATUS_MESSAGE_RETRIEVED cpu_to_le32(0x4000002E)
-#define STATUS_SYSTEM_POWERSTATE_TRANSITION cpu_to_le32(0x4000002F)
-#define STATUS_ALPC_CHECK_COMPLETION_LIST cpu_to_le32(0x40000030)
-#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION cpu_to_le32(0x40000031)
-#define STATUS_ACCESS_AUDIT_BY_POLICY cpu_to_le32(0x40000032)
-#define STATUS_ABANDON_HIBERFILE cpu_to_le32(0x40000033)
-#define STATUS_BIZRULES_NOT_ENABLED cpu_to_le32(0x40000034)
-#define STATUS_WAKE_SYSTEM cpu_to_le32(0x40000294)
-#define STATUS_DS_SHUTTING_DOWN cpu_to_le32(0x40000370)
-#define DBG_REPLY_LATER cpu_to_le32(0x40010001)
-#define DBG_UNABLE_TO_PROVIDE_HANDLE cpu_to_le32(0x40010002)
-#define DBG_TERMINATE_THREAD cpu_to_le32(0x40010003)
-#define DBG_TERMINATE_PROCESS cpu_to_le32(0x40010004)
-#define DBG_CONTROL_C cpu_to_le32(0x40010005)
-#define DBG_PRINTEXCEPTION_C cpu_to_le32(0x40010006)
-#define DBG_RIPEXCEPTION cpu_to_le32(0x40010007)
-#define DBG_CONTROL_BREAK cpu_to_le32(0x40010008)
-#define DBG_COMMAND_EXCEPTION cpu_to_le32(0x40010009)
-#define RPC_NT_UUID_LOCAL_ONLY cpu_to_le32(0x40020056)
-#define RPC_NT_SEND_INCOMPLETE cpu_to_le32(0x400200AF)
-#define STATUS_CTX_CDM_CONNECT cpu_to_le32(0x400A0004)
-#define STATUS_CTX_CDM_DISCONNECT cpu_to_le32(0x400A0005)
-#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT cpu_to_le32(0x4015000D)
-#define STATUS_RECOVERY_NOT_NEEDED cpu_to_le32(0x40190034)
-#define STATUS_RM_ALREADY_STARTED cpu_to_le32(0x40190035)
-#define STATUS_LOG_NO_RESTART cpu_to_le32(0x401A000C)
-#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST cpu_to_le32(0x401B00EC)
-#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED cpu_to_le32(0x401E000A)
-#define STATUS_GRAPHICS_DRIVER_MISMATCH cpu_to_le32(0x401E0117)
-#define STATUS_GRAPHICS_MODE_NOT_PINNED cpu_to_le32(0x401E0307)
-#define STATUS_GRAPHICS_NO_PREFERRED_MODE cpu_to_le32(0x401E031E)
-#define STATUS_GRAPHICS_DATASET_IS_EMPTY cpu_to_le32(0x401E034B)
-#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET cpu_to_le32(0x401E034C)
-#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED \
- cpu_to_le32(0x401E0351)
-#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS cpu_to_le32(0x401E042F)
-#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED cpu_to_le32(0x401E0437)
-#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY cpu_to_le32(0x401E0439)
-#define STATUS_GRAPHICS_START_DEFERRED cpu_to_le32(0x401E043A)
-#define STATUS_NDIS_INDICATION_REQUIRED cpu_to_le32(0x40230001)
-#define STATUS_GUARD_PAGE_VIOLATION cpu_to_le32(0x80000001)
-#define STATUS_DATATYPE_MISALIGNMENT cpu_to_le32(0x80000002)
-#define STATUS_BREAKPOINT cpu_to_le32(0x80000003)
-#define STATUS_SINGLE_STEP cpu_to_le32(0x80000004)
-#define STATUS_BUFFER_OVERFLOW cpu_to_le32(0x80000005)
-#define STATUS_NO_MORE_FILES cpu_to_le32(0x80000006)
-#define STATUS_WAKE_SYSTEM_DEBUGGER cpu_to_le32(0x80000007)
-#define STATUS_HANDLES_CLOSED cpu_to_le32(0x8000000A)
-#define STATUS_NO_INHERITANCE cpu_to_le32(0x8000000B)
-#define STATUS_GUID_SUBSTITUTION_MADE cpu_to_le32(0x8000000C)
-#define STATUS_PARTIAL_COPY cpu_to_le32(0x8000000D)
-#define STATUS_DEVICE_PAPER_EMPTY cpu_to_le32(0x8000000E)
-#define STATUS_DEVICE_POWERED_OFF cpu_to_le32(0x8000000F)
-#define STATUS_DEVICE_OFF_LINE cpu_to_le32(0x80000010)
-#define STATUS_DEVICE_BUSY cpu_to_le32(0x80000011)
-#define STATUS_NO_MORE_EAS cpu_to_le32(0x80000012)
-#define STATUS_INVALID_EA_NAME cpu_to_le32(0x80000013)
-#define STATUS_EA_LIST_INCONSISTENT cpu_to_le32(0x80000014)
-#define STATUS_INVALID_EA_FLAG cpu_to_le32(0x80000015)
-#define STATUS_VERIFY_REQUIRED cpu_to_le32(0x80000016)
-#define STATUS_EXTRANEOUS_INFORMATION cpu_to_le32(0x80000017)
-#define STATUS_RXACT_COMMIT_NECESSARY cpu_to_le32(0x80000018)
-#define STATUS_NO_MORE_ENTRIES cpu_to_le32(0x8000001A)
-#define STATUS_FILEMARK_DETECTED cpu_to_le32(0x8000001B)
-#define STATUS_MEDIA_CHANGED cpu_to_le32(0x8000001C)
-#define STATUS_BUS_RESET cpu_to_le32(0x8000001D)
-#define STATUS_END_OF_MEDIA cpu_to_le32(0x8000001E)
-#define STATUS_BEGINNING_OF_MEDIA cpu_to_le32(0x8000001F)
-#define STATUS_MEDIA_CHECK cpu_to_le32(0x80000020)
-#define STATUS_SETMARK_DETECTED cpu_to_le32(0x80000021)
-#define STATUS_NO_DATA_DETECTED cpu_to_le32(0x80000022)
-#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES cpu_to_le32(0x80000023)
-#define STATUS_SERVER_HAS_OPEN_HANDLES cpu_to_le32(0x80000024)
-#define STATUS_ALREADY_DISCONNECTED cpu_to_le32(0x80000025)
-#define STATUS_LONGJUMP cpu_to_le32(0x80000026)
-#define STATUS_CLEANER_CARTRIDGE_INSTALLED cpu_to_le32(0x80000027)
-#define STATUS_PLUGPLAY_QUERY_VETOED cpu_to_le32(0x80000028)
-#define STATUS_UNWIND_CONSOLIDATE cpu_to_le32(0x80000029)
-#define STATUS_REGISTRY_HIVE_RECOVERED cpu_to_le32(0x8000002A)
-#define STATUS_DLL_MIGHT_BE_INSECURE cpu_to_le32(0x8000002B)
-#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE cpu_to_le32(0x8000002C)
-#define STATUS_STOPPED_ON_SYMLINK cpu_to_le32(0x8000002D)
-#define STATUS_DEVICE_REQUIRES_CLEANING cpu_to_le32(0x80000288)
-#define STATUS_DEVICE_DOOR_OPEN cpu_to_le32(0x80000289)
-#define STATUS_DATA_LOST_REPAIR cpu_to_le32(0x80000803)
-#define DBG_EXCEPTION_NOT_HANDLED cpu_to_le32(0x80010001)
-#define STATUS_CLUSTER_NODE_ALREADY_UP cpu_to_le32(0x80130001)
-#define STATUS_CLUSTER_NODE_ALREADY_DOWN cpu_to_le32(0x80130002)
-#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE cpu_to_le32(0x80130003)
-#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE cpu_to_le32(0x80130004)
-#define STATUS_CLUSTER_NODE_ALREADY_MEMBER cpu_to_le32(0x80130005)
-#define STATUS_COULD_NOT_RESIZE_LOG cpu_to_le32(0x80190009)
-#define STATUS_NO_TXF_METADATA cpu_to_le32(0x80190029)
-#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN cpu_to_le32(0x80190031)
-#define STATUS_TXF_METADATA_ALREADY_PRESENT cpu_to_le32(0x80190041)
-#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET cpu_to_le32(0x80190042)
-#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED \
- cpu_to_le32(0x801B00EB)
-#define STATUS_FLT_BUFFER_TOO_SMALL cpu_to_le32(0x801C0001)
-#define STATUS_FVE_PARTIAL_METADATA cpu_to_le32(0x80210001)
-#define STATUS_UNSUCCESSFUL cpu_to_le32(0xC0000001)
-#define STATUS_NOT_IMPLEMENTED cpu_to_le32(0xC0000002)
-#define STATUS_INVALID_INFO_CLASS cpu_to_le32(0xC0000003)
-#define STATUS_INFO_LENGTH_MISMATCH cpu_to_le32(0xC0000004)
-#define STATUS_ACCESS_VIOLATION cpu_to_le32(0xC0000005)
-#define STATUS_IN_PAGE_ERROR cpu_to_le32(0xC0000006)
-#define STATUS_PAGEFILE_QUOTA cpu_to_le32(0xC0000007)
-#define STATUS_INVALID_HANDLE cpu_to_le32(0xC0000008)
-#define STATUS_BAD_INITIAL_STACK cpu_to_le32(0xC0000009)
-#define STATUS_BAD_INITIAL_PC cpu_to_le32(0xC000000A)
-#define STATUS_INVALID_CID cpu_to_le32(0xC000000B)
-#define STATUS_TIMER_NOT_CANCELED cpu_to_le32(0xC000000C)
-#define STATUS_INVALID_PARAMETER cpu_to_le32(0xC000000D)
-#define STATUS_NO_SUCH_DEVICE cpu_to_le32(0xC000000E)
-#define STATUS_NO_SUCH_FILE cpu_to_le32(0xC000000F)
-#define STATUS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0000010)
-#define STATUS_END_OF_FILE cpu_to_le32(0xC0000011)
-#define STATUS_WRONG_VOLUME cpu_to_le32(0xC0000012)
-#define STATUS_NO_MEDIA_IN_DEVICE cpu_to_le32(0xC0000013)
-#define STATUS_UNRECOGNIZED_MEDIA cpu_to_le32(0xC0000014)
-#define STATUS_NONEXISTENT_SECTOR cpu_to_le32(0xC0000015)
-#define STATUS_MORE_PROCESSING_REQUIRED cpu_to_le32(0xC0000016)
-#define STATUS_NO_MEMORY cpu_to_le32(0xC0000017)
-#define STATUS_CONFLICTING_ADDRESSES cpu_to_le32(0xC0000018)
-#define STATUS_NOT_MAPPED_VIEW cpu_to_le32(0xC0000019)
-#define STATUS_UNABLE_TO_FREE_VM cpu_to_le32(0xC000001A)
-#define STATUS_UNABLE_TO_DELETE_SECTION cpu_to_le32(0xC000001B)
-#define STATUS_INVALID_SYSTEM_SERVICE cpu_to_le32(0xC000001C)
-#define STATUS_ILLEGAL_INSTRUCTION cpu_to_le32(0xC000001D)
-#define STATUS_INVALID_LOCK_SEQUENCE cpu_to_le32(0xC000001E)
-#define STATUS_INVALID_VIEW_SIZE cpu_to_le32(0xC000001F)
-#define STATUS_INVALID_FILE_FOR_SECTION cpu_to_le32(0xC0000020)
-#define STATUS_ALREADY_COMMITTED cpu_to_le32(0xC0000021)
-#define STATUS_ACCESS_DENIED cpu_to_le32(0xC0000022)
-#define STATUS_BUFFER_TOO_SMALL cpu_to_le32(0xC0000023)
-#define STATUS_OBJECT_TYPE_MISMATCH cpu_to_le32(0xC0000024)
-#define STATUS_NONCONTINUABLE_EXCEPTION cpu_to_le32(0xC0000025)
-#define STATUS_INVALID_DISPOSITION cpu_to_le32(0xC0000026)
-#define STATUS_UNWIND cpu_to_le32(0xC0000027)
-#define STATUS_BAD_STACK cpu_to_le32(0xC0000028)
-#define STATUS_INVALID_UNWIND_TARGET cpu_to_le32(0xC0000029)
-#define STATUS_NOT_LOCKED cpu_to_le32(0xC000002A)
-#define STATUS_PARITY_ERROR cpu_to_le32(0xC000002B)
-#define STATUS_UNABLE_TO_DECOMMIT_VM cpu_to_le32(0xC000002C)
-#define STATUS_NOT_COMMITTED cpu_to_le32(0xC000002D)
-#define STATUS_INVALID_PORT_ATTRIBUTES cpu_to_le32(0xC000002E)
-#define STATUS_PORT_MESSAGE_TOO_LONG cpu_to_le32(0xC000002F)
-#define STATUS_INVALID_PARAMETER_MIX cpu_to_le32(0xC0000030)
-#define STATUS_INVALID_QUOTA_LOWER cpu_to_le32(0xC0000031)
-#define STATUS_DISK_CORRUPT_ERROR cpu_to_le32(0xC0000032)
-#define STATUS_OBJECT_NAME_INVALID cpu_to_le32(0xC0000033)
-#define STATUS_OBJECT_NAME_NOT_FOUND cpu_to_le32(0xC0000034)
-#define STATUS_OBJECT_NAME_COLLISION cpu_to_le32(0xC0000035)
-#define STATUS_PORT_DISCONNECTED cpu_to_le32(0xC0000037)
-#define STATUS_DEVICE_ALREADY_ATTACHED cpu_to_le32(0xC0000038)
-#define STATUS_OBJECT_PATH_INVALID cpu_to_le32(0xC0000039)
-#define STATUS_OBJECT_PATH_NOT_FOUND cpu_to_le32(0xC000003A)
-#define STATUS_OBJECT_PATH_SYNTAX_BAD cpu_to_le32(0xC000003B)
-#define STATUS_DATA_OVERRUN cpu_to_le32(0xC000003C)
-#define STATUS_DATA_LATE_ERROR cpu_to_le32(0xC000003D)
-#define STATUS_DATA_ERROR cpu_to_le32(0xC000003E)
-#define STATUS_CRC_ERROR cpu_to_le32(0xC000003F)
-#define STATUS_SECTION_TOO_BIG cpu_to_le32(0xC0000040)
-#define STATUS_PORT_CONNECTION_REFUSED cpu_to_le32(0xC0000041)
-#define STATUS_INVALID_PORT_HANDLE cpu_to_le32(0xC0000042)
-#define STATUS_SHARING_VIOLATION cpu_to_le32(0xC0000043)
-#define STATUS_QUOTA_EXCEEDED cpu_to_le32(0xC0000044)
-#define STATUS_INVALID_PAGE_PROTECTION cpu_to_le32(0xC0000045)
-#define STATUS_MUTANT_NOT_OWNED cpu_to_le32(0xC0000046)
-#define STATUS_SEMAPHORE_LIMIT_EXCEEDED cpu_to_le32(0xC0000047)
-#define STATUS_PORT_ALREADY_SET cpu_to_le32(0xC0000048)
-#define STATUS_SECTION_NOT_IMAGE cpu_to_le32(0xC0000049)
-#define STATUS_SUSPEND_COUNT_EXCEEDED cpu_to_le32(0xC000004A)
-#define STATUS_THREAD_IS_TERMINATING cpu_to_le32(0xC000004B)
-#define STATUS_BAD_WORKING_SET_LIMIT cpu_to_le32(0xC000004C)
-#define STATUS_INCOMPATIBLE_FILE_MAP cpu_to_le32(0xC000004D)
-#define STATUS_SECTION_PROTECTION cpu_to_le32(0xC000004E)
-#define STATUS_EAS_NOT_SUPPORTED cpu_to_le32(0xC000004F)
-#define STATUS_EA_TOO_LARGE cpu_to_le32(0xC0000050)
-#define STATUS_NONEXISTENT_EA_ENTRY cpu_to_le32(0xC0000051)
-#define STATUS_NO_EAS_ON_FILE cpu_to_le32(0xC0000052)
-#define STATUS_EA_CORRUPT_ERROR cpu_to_le32(0xC0000053)
-#define STATUS_FILE_LOCK_CONFLICT cpu_to_le32(0xC0000054)
-#define STATUS_LOCK_NOT_GRANTED cpu_to_le32(0xC0000055)
-#define STATUS_DELETE_PENDING cpu_to_le32(0xC0000056)
-#define STATUS_CTL_FILE_NOT_SUPPORTED cpu_to_le32(0xC0000057)
-#define STATUS_UNKNOWN_REVISION cpu_to_le32(0xC0000058)
-#define STATUS_REVISION_MISMATCH cpu_to_le32(0xC0000059)
-#define STATUS_INVALID_OWNER cpu_to_le32(0xC000005A)
-#define STATUS_INVALID_PRIMARY_GROUP cpu_to_le32(0xC000005B)
-#define STATUS_NO_IMPERSONATION_TOKEN cpu_to_le32(0xC000005C)
-#define STATUS_CANT_DISABLE_MANDATORY cpu_to_le32(0xC000005D)
-#define STATUS_NO_LOGON_SERVERS cpu_to_le32(0xC000005E)
-#define STATUS_NO_SUCH_LOGON_SESSION cpu_to_le32(0xC000005F)
-#define STATUS_NO_SUCH_PRIVILEGE cpu_to_le32(0xC0000060)
-#define STATUS_PRIVILEGE_NOT_HELD cpu_to_le32(0xC0000061)
-#define STATUS_INVALID_ACCOUNT_NAME cpu_to_le32(0xC0000062)
-#define STATUS_USER_EXISTS cpu_to_le32(0xC0000063)
-#define STATUS_NO_SUCH_USER cpu_to_le32(0xC0000064)
-#define STATUS_GROUP_EXISTS cpu_to_le32(0xC0000065)
-#define STATUS_NO_SUCH_GROUP cpu_to_le32(0xC0000066)
-#define STATUS_MEMBER_IN_GROUP cpu_to_le32(0xC0000067)
-#define STATUS_MEMBER_NOT_IN_GROUP cpu_to_le32(0xC0000068)
-#define STATUS_LAST_ADMIN cpu_to_le32(0xC0000069)
-#define STATUS_WRONG_PASSWORD cpu_to_le32(0xC000006A)
-#define STATUS_ILL_FORMED_PASSWORD cpu_to_le32(0xC000006B)
-#define STATUS_PASSWORD_RESTRICTION cpu_to_le32(0xC000006C)
-#define STATUS_LOGON_FAILURE cpu_to_le32(0xC000006D)
-#define STATUS_ACCOUNT_RESTRICTION cpu_to_le32(0xC000006E)
-#define STATUS_INVALID_LOGON_HOURS cpu_to_le32(0xC000006F)
-#define STATUS_INVALID_WORKSTATION cpu_to_le32(0xC0000070)
-#define STATUS_PASSWORD_EXPIRED cpu_to_le32(0xC0000071)
-#define STATUS_ACCOUNT_DISABLED cpu_to_le32(0xC0000072)
-#define STATUS_NONE_MAPPED cpu_to_le32(0xC0000073)
-#define STATUS_TOO_MANY_LUIDS_REQUESTED cpu_to_le32(0xC0000074)
-#define STATUS_LUIDS_EXHAUSTED cpu_to_le32(0xC0000075)
-#define STATUS_INVALID_SUB_AUTHORITY cpu_to_le32(0xC0000076)
-#define STATUS_INVALID_ACL cpu_to_le32(0xC0000077)
-#define STATUS_INVALID_SID cpu_to_le32(0xC0000078)
-#define STATUS_INVALID_SECURITY_DESCR cpu_to_le32(0xC0000079)
-#define STATUS_PROCEDURE_NOT_FOUND cpu_to_le32(0xC000007A)
-#define STATUS_INVALID_IMAGE_FORMAT cpu_to_le32(0xC000007B)
-#define STATUS_NO_TOKEN cpu_to_le32(0xC000007C)
-#define STATUS_BAD_INHERITANCE_ACL cpu_to_le32(0xC000007D)
-#define STATUS_RANGE_NOT_LOCKED cpu_to_le32(0xC000007E)
-#define STATUS_DISK_FULL cpu_to_le32(0xC000007F)
-#define STATUS_SERVER_DISABLED cpu_to_le32(0xC0000080)
-#define STATUS_SERVER_NOT_DISABLED cpu_to_le32(0xC0000081)
-#define STATUS_TOO_MANY_GUIDS_REQUESTED cpu_to_le32(0xC0000082)
-#define STATUS_GUIDS_EXHAUSTED cpu_to_le32(0xC0000083)
-#define STATUS_INVALID_ID_AUTHORITY cpu_to_le32(0xC0000084)
-#define STATUS_AGENTS_EXHAUSTED cpu_to_le32(0xC0000085)
-#define STATUS_INVALID_VOLUME_LABEL cpu_to_le32(0xC0000086)
-#define STATUS_SECTION_NOT_EXTENDED cpu_to_le32(0xC0000087)
-#define STATUS_NOT_MAPPED_DATA cpu_to_le32(0xC0000088)
-#define STATUS_RESOURCE_DATA_NOT_FOUND cpu_to_le32(0xC0000089)
-#define STATUS_RESOURCE_TYPE_NOT_FOUND cpu_to_le32(0xC000008A)
-#define STATUS_RESOURCE_NAME_NOT_FOUND cpu_to_le32(0xC000008B)
-#define STATUS_ARRAY_BOUNDS_EXCEEDED cpu_to_le32(0xC000008C)
-#define STATUS_FLOAT_DENORMAL_OPERAND cpu_to_le32(0xC000008D)
-#define STATUS_FLOAT_DIVIDE_BY_ZERO cpu_to_le32(0xC000008E)
-#define STATUS_FLOAT_INEXACT_RESULT cpu_to_le32(0xC000008F)
-#define STATUS_FLOAT_INVALID_OPERATION cpu_to_le32(0xC0000090)
-#define STATUS_FLOAT_OVERFLOW cpu_to_le32(0xC0000091)
-#define STATUS_FLOAT_STACK_CHECK cpu_to_le32(0xC0000092)
-#define STATUS_FLOAT_UNDERFLOW cpu_to_le32(0xC0000093)
-#define STATUS_INTEGER_DIVIDE_BY_ZERO cpu_to_le32(0xC0000094)
-#define STATUS_INTEGER_OVERFLOW cpu_to_le32(0xC0000095)
-#define STATUS_PRIVILEGED_INSTRUCTION cpu_to_le32(0xC0000096)
-#define STATUS_TOO_MANY_PAGING_FILES cpu_to_le32(0xC0000097)
-#define STATUS_FILE_INVALID cpu_to_le32(0xC0000098)
-#define STATUS_ALLOTTED_SPACE_EXCEEDED cpu_to_le32(0xC0000099)
-#define STATUS_INSUFFICIENT_RESOURCES cpu_to_le32(0xC000009A)
-#define STATUS_DFS_EXIT_PATH_FOUND cpu_to_le32(0xC000009B)
-#define STATUS_DEVICE_DATA_ERROR cpu_to_le32(0xC000009C)
-#define STATUS_DEVICE_NOT_CONNECTED cpu_to_le32(0xC000009D)
-#define STATUS_DEVICE_POWER_FAILURE cpu_to_le32(0xC000009E)
-#define STATUS_FREE_VM_NOT_AT_BASE cpu_to_le32(0xC000009F)
-#define STATUS_MEMORY_NOT_ALLOCATED cpu_to_le32(0xC00000A0)
-#define STATUS_WORKING_SET_QUOTA cpu_to_le32(0xC00000A1)
-#define STATUS_MEDIA_WRITE_PROTECTED cpu_to_le32(0xC00000A2)
-#define STATUS_DEVICE_NOT_READY cpu_to_le32(0xC00000A3)
-#define STATUS_INVALID_GROUP_ATTRIBUTES cpu_to_le32(0xC00000A4)
-#define STATUS_BAD_IMPERSONATION_LEVEL cpu_to_le32(0xC00000A5)
-#define STATUS_CANT_OPEN_ANONYMOUS cpu_to_le32(0xC00000A6)
-#define STATUS_BAD_VALIDATION_CLASS cpu_to_le32(0xC00000A7)
-#define STATUS_BAD_TOKEN_TYPE cpu_to_le32(0xC00000A8)
-#define STATUS_BAD_MASTER_BOOT_RECORD cpu_to_le32(0xC00000A9)
-#define STATUS_INSTRUCTION_MISALIGNMENT cpu_to_le32(0xC00000AA)
-#define STATUS_INSTANCE_NOT_AVAILABLE cpu_to_le32(0xC00000AB)
-#define STATUS_PIPE_NOT_AVAILABLE cpu_to_le32(0xC00000AC)
-#define STATUS_INVALID_PIPE_STATE cpu_to_le32(0xC00000AD)
-#define STATUS_PIPE_BUSY cpu_to_le32(0xC00000AE)
-#define STATUS_ILLEGAL_FUNCTION cpu_to_le32(0xC00000AF)
-#define STATUS_PIPE_DISCONNECTED cpu_to_le32(0xC00000B0)
-#define STATUS_PIPE_CLOSING cpu_to_le32(0xC00000B1)
-#define STATUS_PIPE_CONNECTED cpu_to_le32(0xC00000B2)
-#define STATUS_PIPE_LISTENING cpu_to_le32(0xC00000B3)
-#define STATUS_INVALID_READ_MODE cpu_to_le32(0xC00000B4)
-#define STATUS_IO_TIMEOUT cpu_to_le32(0xC00000B5)
-#define STATUS_FILE_FORCED_CLOSED cpu_to_le32(0xC00000B6)
-#define STATUS_PROFILING_NOT_STARTED cpu_to_le32(0xC00000B7)
-#define STATUS_PROFILING_NOT_STOPPED cpu_to_le32(0xC00000B8)
-#define STATUS_COULD_NOT_INTERPRET cpu_to_le32(0xC00000B9)
-#define STATUS_FILE_IS_A_DIRECTORY cpu_to_le32(0xC00000BA)
-#define STATUS_NOT_SUPPORTED cpu_to_le32(0xC00000BB)
-#define STATUS_REMOTE_NOT_LISTENING cpu_to_le32(0xC00000BC)
-#define STATUS_DUPLICATE_NAME cpu_to_le32(0xC00000BD)
-#define STATUS_BAD_NETWORK_PATH cpu_to_le32(0xC00000BE)
-#define STATUS_NETWORK_BUSY cpu_to_le32(0xC00000BF)
-#define STATUS_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC00000C0)
-#define STATUS_TOO_MANY_COMMANDS cpu_to_le32(0xC00000C1)
-#define STATUS_ADAPTER_HARDWARE_ERROR cpu_to_le32(0xC00000C2)
-#define STATUS_INVALID_NETWORK_RESPONSE cpu_to_le32(0xC00000C3)
-#define STATUS_UNEXPECTED_NETWORK_ERROR cpu_to_le32(0xC00000C4)
-#define STATUS_BAD_REMOTE_ADAPTER cpu_to_le32(0xC00000C5)
-#define STATUS_PRINT_QUEUE_FULL cpu_to_le32(0xC00000C6)
-#define STATUS_NO_SPOOL_SPACE cpu_to_le32(0xC00000C7)
-#define STATUS_PRINT_CANCELLED cpu_to_le32(0xC00000C8)
-#define STATUS_NETWORK_NAME_DELETED cpu_to_le32(0xC00000C9)
-#define STATUS_NETWORK_ACCESS_DENIED cpu_to_le32(0xC00000CA)
-#define STATUS_BAD_DEVICE_TYPE cpu_to_le32(0xC00000CB)
-#define STATUS_BAD_NETWORK_NAME cpu_to_le32(0xC00000CC)
-#define STATUS_TOO_MANY_NAMES cpu_to_le32(0xC00000CD)
-#define STATUS_TOO_MANY_SESSIONS cpu_to_le32(0xC00000CE)
-#define STATUS_SHARING_PAUSED cpu_to_le32(0xC00000CF)
-#define STATUS_REQUEST_NOT_ACCEPTED cpu_to_le32(0xC00000D0)
-#define STATUS_REDIRECTOR_PAUSED cpu_to_le32(0xC00000D1)
-#define STATUS_NET_WRITE_FAULT cpu_to_le32(0xC00000D2)
-#define STATUS_PROFILING_AT_LIMIT cpu_to_le32(0xC00000D3)
-#define STATUS_NOT_SAME_DEVICE cpu_to_le32(0xC00000D4)
-#define STATUS_FILE_RENAMED cpu_to_le32(0xC00000D5)
-#define STATUS_VIRTUAL_CIRCUIT_CLOSED cpu_to_le32(0xC00000D6)
-#define STATUS_NO_SECURITY_ON_OBJECT cpu_to_le32(0xC00000D7)
-#define STATUS_CANT_WAIT cpu_to_le32(0xC00000D8)
-#define STATUS_PIPE_EMPTY cpu_to_le32(0xC00000D9)
-#define STATUS_CANT_ACCESS_DOMAIN_INFO cpu_to_le32(0xC00000DA)
-#define STATUS_CANT_TERMINATE_SELF cpu_to_le32(0xC00000DB)
-#define STATUS_INVALID_SERVER_STATE cpu_to_le32(0xC00000DC)
-#define STATUS_INVALID_DOMAIN_STATE cpu_to_le32(0xC00000DD)
-#define STATUS_INVALID_DOMAIN_ROLE cpu_to_le32(0xC00000DE)
-#define STATUS_NO_SUCH_DOMAIN cpu_to_le32(0xC00000DF)
-#define STATUS_DOMAIN_EXISTS cpu_to_le32(0xC00000E0)
-#define STATUS_DOMAIN_LIMIT_EXCEEDED cpu_to_le32(0xC00000E1)
-#define STATUS_OPLOCK_NOT_GRANTED cpu_to_le32(0xC00000E2)
-#define STATUS_INVALID_OPLOCK_PROTOCOL cpu_to_le32(0xC00000E3)
-#define STATUS_INTERNAL_DB_CORRUPTION cpu_to_le32(0xC00000E4)
-#define STATUS_INTERNAL_ERROR cpu_to_le32(0xC00000E5)
-#define STATUS_GENERIC_NOT_MAPPED cpu_to_le32(0xC00000E6)
-#define STATUS_BAD_DESCRIPTOR_FORMAT cpu_to_le32(0xC00000E7)
-#define STATUS_INVALID_USER_BUFFER cpu_to_le32(0xC00000E8)
-#define STATUS_UNEXPECTED_IO_ERROR cpu_to_le32(0xC00000E9)
-#define STATUS_UNEXPECTED_MM_CREATE_ERR cpu_to_le32(0xC00000EA)
-#define STATUS_UNEXPECTED_MM_MAP_ERROR cpu_to_le32(0xC00000EB)
-#define STATUS_UNEXPECTED_MM_EXTEND_ERR cpu_to_le32(0xC00000EC)
-#define STATUS_NOT_LOGON_PROCESS cpu_to_le32(0xC00000ED)
-#define STATUS_LOGON_SESSION_EXISTS cpu_to_le32(0xC00000EE)
-#define STATUS_INVALID_PARAMETER_1 cpu_to_le32(0xC00000EF)
-#define STATUS_INVALID_PARAMETER_2 cpu_to_le32(0xC00000F0)
-#define STATUS_INVALID_PARAMETER_3 cpu_to_le32(0xC00000F1)
-#define STATUS_INVALID_PARAMETER_4 cpu_to_le32(0xC00000F2)
-#define STATUS_INVALID_PARAMETER_5 cpu_to_le32(0xC00000F3)
-#define STATUS_INVALID_PARAMETER_6 cpu_to_le32(0xC00000F4)
-#define STATUS_INVALID_PARAMETER_7 cpu_to_le32(0xC00000F5)
-#define STATUS_INVALID_PARAMETER_8 cpu_to_le32(0xC00000F6)
-#define STATUS_INVALID_PARAMETER_9 cpu_to_le32(0xC00000F7)
-#define STATUS_INVALID_PARAMETER_10 cpu_to_le32(0xC00000F8)
-#define STATUS_INVALID_PARAMETER_11 cpu_to_le32(0xC00000F9)
-#define STATUS_INVALID_PARAMETER_12 cpu_to_le32(0xC00000FA)
-#define STATUS_REDIRECTOR_NOT_STARTED cpu_to_le32(0xC00000FB)
-#define STATUS_REDIRECTOR_STARTED cpu_to_le32(0xC00000FC)
-#define STATUS_STACK_OVERFLOW cpu_to_le32(0xC00000FD)
-#define STATUS_NO_SUCH_PACKAGE cpu_to_le32(0xC00000FE)
-#define STATUS_BAD_FUNCTION_TABLE cpu_to_le32(0xC00000FF)
-#define STATUS_VARIABLE_NOT_FOUND cpu_to_le32(0xC0000100)
-#define STATUS_DIRECTORY_NOT_EMPTY cpu_to_le32(0xC0000101)
-#define STATUS_FILE_CORRUPT_ERROR cpu_to_le32(0xC0000102)
-#define STATUS_NOT_A_DIRECTORY cpu_to_le32(0xC0000103)
-#define STATUS_BAD_LOGON_SESSION_STATE cpu_to_le32(0xC0000104)
-#define STATUS_LOGON_SESSION_COLLISION cpu_to_le32(0xC0000105)
-#define STATUS_NAME_TOO_LONG cpu_to_le32(0xC0000106)
-#define STATUS_FILES_OPEN cpu_to_le32(0xC0000107)
-#define STATUS_CONNECTION_IN_USE cpu_to_le32(0xC0000108)
-#define STATUS_MESSAGE_NOT_FOUND cpu_to_le32(0xC0000109)
-#define STATUS_PROCESS_IS_TERMINATING cpu_to_le32(0xC000010A)
-#define STATUS_INVALID_LOGON_TYPE cpu_to_le32(0xC000010B)
-#define STATUS_NO_GUID_TRANSLATION cpu_to_le32(0xC000010C)
-#define STATUS_CANNOT_IMPERSONATE cpu_to_le32(0xC000010D)
-#define STATUS_IMAGE_ALREADY_LOADED cpu_to_le32(0xC000010E)
-#define STATUS_ABIOS_NOT_PRESENT cpu_to_le32(0xC000010F)
-#define STATUS_ABIOS_LID_NOT_EXIST cpu_to_le32(0xC0000110)
-#define STATUS_ABIOS_LID_ALREADY_OWNED cpu_to_le32(0xC0000111)
-#define STATUS_ABIOS_NOT_LID_OWNER cpu_to_le32(0xC0000112)
-#define STATUS_ABIOS_INVALID_COMMAND cpu_to_le32(0xC0000113)
-#define STATUS_ABIOS_INVALID_LID cpu_to_le32(0xC0000114)
-#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE cpu_to_le32(0xC0000115)
-#define STATUS_ABIOS_INVALID_SELECTOR cpu_to_le32(0xC0000116)
-#define STATUS_NO_LDT cpu_to_le32(0xC0000117)
-#define STATUS_INVALID_LDT_SIZE cpu_to_le32(0xC0000118)
-#define STATUS_INVALID_LDT_OFFSET cpu_to_le32(0xC0000119)
-#define STATUS_INVALID_LDT_DESCRIPTOR cpu_to_le32(0xC000011A)
-#define STATUS_INVALID_IMAGE_NE_FORMAT cpu_to_le32(0xC000011B)
-#define STATUS_RXACT_INVALID_STATE cpu_to_le32(0xC000011C)
-#define STATUS_RXACT_COMMIT_FAILURE cpu_to_le32(0xC000011D)
-#define STATUS_MAPPED_FILE_SIZE_ZERO cpu_to_le32(0xC000011E)
-#define STATUS_TOO_MANY_OPENED_FILES cpu_to_le32(0xC000011F)
-#define STATUS_CANCELLED cpu_to_le32(0xC0000120)
-#define STATUS_CANNOT_DELETE cpu_to_le32(0xC0000121)
-#define STATUS_INVALID_COMPUTER_NAME cpu_to_le32(0xC0000122)
-#define STATUS_FILE_DELETED cpu_to_le32(0xC0000123)
-#define STATUS_SPECIAL_ACCOUNT cpu_to_le32(0xC0000124)
-#define STATUS_SPECIAL_GROUP cpu_to_le32(0xC0000125)
-#define STATUS_SPECIAL_USER cpu_to_le32(0xC0000126)
-#define STATUS_MEMBERS_PRIMARY_GROUP cpu_to_le32(0xC0000127)
-#define STATUS_FILE_CLOSED cpu_to_le32(0xC0000128)
-#define STATUS_TOO_MANY_THREADS cpu_to_le32(0xC0000129)
-#define STATUS_THREAD_NOT_IN_PROCESS cpu_to_le32(0xC000012A)
-#define STATUS_TOKEN_ALREADY_IN_USE cpu_to_le32(0xC000012B)
-#define STATUS_PAGEFILE_QUOTA_EXCEEDED cpu_to_le32(0xC000012C)
-#define STATUS_COMMITMENT_LIMIT cpu_to_le32(0xC000012D)
-#define STATUS_INVALID_IMAGE_LE_FORMAT cpu_to_le32(0xC000012E)
-#define STATUS_INVALID_IMAGE_NOT_MZ cpu_to_le32(0xC000012F)
-#define STATUS_INVALID_IMAGE_PROTECT cpu_to_le32(0xC0000130)
-#define STATUS_INVALID_IMAGE_WIN_16 cpu_to_le32(0xC0000131)
-#define STATUS_LOGON_SERVER_CONFLICT cpu_to_le32(0xC0000132)
-#define STATUS_TIME_DIFFERENCE_AT_DC cpu_to_le32(0xC0000133)
-#define STATUS_SYNCHRONIZATION_REQUIRED cpu_to_le32(0xC0000134)
-#define STATUS_DLL_NOT_FOUND cpu_to_le32(0xC0000135)
-#define STATUS_OPEN_FAILED cpu_to_le32(0xC0000136)
-#define STATUS_IO_PRIVILEGE_FAILED cpu_to_le32(0xC0000137)
-#define STATUS_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000138)
-#define STATUS_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000139)
-#define STATUS_CONTROL_C_EXIT cpu_to_le32(0xC000013A)
-#define STATUS_LOCAL_DISCONNECT cpu_to_le32(0xC000013B)
-#define STATUS_REMOTE_DISCONNECT cpu_to_le32(0xC000013C)
-#define STATUS_REMOTE_RESOURCES cpu_to_le32(0xC000013D)
-#define STATUS_LINK_FAILED cpu_to_le32(0xC000013E)
-#define STATUS_LINK_TIMEOUT cpu_to_le32(0xC000013F)
-#define STATUS_INVALID_CONNECTION cpu_to_le32(0xC0000140)
-#define STATUS_INVALID_ADDRESS cpu_to_le32(0xC0000141)
-#define STATUS_DLL_INIT_FAILED cpu_to_le32(0xC0000142)
-#define STATUS_MISSING_SYSTEMFILE cpu_to_le32(0xC0000143)
-#define STATUS_UNHANDLED_EXCEPTION cpu_to_le32(0xC0000144)
-#define STATUS_APP_INIT_FAILURE cpu_to_le32(0xC0000145)
-#define STATUS_PAGEFILE_CREATE_FAILED cpu_to_le32(0xC0000146)
-#define STATUS_NO_PAGEFILE cpu_to_le32(0xC0000147)
-#define STATUS_INVALID_LEVEL cpu_to_le32(0xC0000148)
-#define STATUS_WRONG_PASSWORD_CORE cpu_to_le32(0xC0000149)
-#define STATUS_ILLEGAL_FLOAT_CONTEXT cpu_to_le32(0xC000014A)
-#define STATUS_PIPE_BROKEN cpu_to_le32(0xC000014B)
-#define STATUS_REGISTRY_CORRUPT cpu_to_le32(0xC000014C)
-#define STATUS_REGISTRY_IO_FAILED cpu_to_le32(0xC000014D)
-#define STATUS_NO_EVENT_PAIR cpu_to_le32(0xC000014E)
-#define STATUS_UNRECOGNIZED_VOLUME cpu_to_le32(0xC000014F)
-#define STATUS_SERIAL_NO_DEVICE_INITED cpu_to_le32(0xC0000150)
-#define STATUS_NO_SUCH_ALIAS cpu_to_le32(0xC0000151)
-#define STATUS_MEMBER_NOT_IN_ALIAS cpu_to_le32(0xC0000152)
-#define STATUS_MEMBER_IN_ALIAS cpu_to_le32(0xC0000153)
-#define STATUS_ALIAS_EXISTS cpu_to_le32(0xC0000154)
-#define STATUS_LOGON_NOT_GRANTED cpu_to_le32(0xC0000155)
-#define STATUS_TOO_MANY_SECRETS cpu_to_le32(0xC0000156)
-#define STATUS_SECRET_TOO_LONG cpu_to_le32(0xC0000157)
-#define STATUS_INTERNAL_DB_ERROR cpu_to_le32(0xC0000158)
-#define STATUS_FULLSCREEN_MODE cpu_to_le32(0xC0000159)
-#define STATUS_TOO_MANY_CONTEXT_IDS cpu_to_le32(0xC000015A)
-#define STATUS_LOGON_TYPE_NOT_GRANTED cpu_to_le32(0xC000015B)
-#define STATUS_NOT_REGISTRY_FILE cpu_to_le32(0xC000015C)
-#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000015D)
-#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR cpu_to_le32(0xC000015E)
-#define STATUS_FT_MISSING_MEMBER cpu_to_le32(0xC000015F)
-#define STATUS_ILL_FORMED_SERVICE_ENTRY cpu_to_le32(0xC0000160)
-#define STATUS_ILLEGAL_CHARACTER cpu_to_le32(0xC0000161)
-#define STATUS_UNMAPPABLE_CHARACTER cpu_to_le32(0xC0000162)
-#define STATUS_UNDEFINED_CHARACTER cpu_to_le32(0xC0000163)
-#define STATUS_FLOPPY_VOLUME cpu_to_le32(0xC0000164)
-#define STATUS_FLOPPY_ID_MARK_NOT_FOUND cpu_to_le32(0xC0000165)
-#define STATUS_FLOPPY_WRONG_CYLINDER cpu_to_le32(0xC0000166)
-#define STATUS_FLOPPY_UNKNOWN_ERROR cpu_to_le32(0xC0000167)
-#define STATUS_FLOPPY_BAD_REGISTERS cpu_to_le32(0xC0000168)
-#define STATUS_DISK_RECALIBRATE_FAILED cpu_to_le32(0xC0000169)
-#define STATUS_DISK_OPERATION_FAILED cpu_to_le32(0xC000016A)
-#define STATUS_DISK_RESET_FAILED cpu_to_le32(0xC000016B)
-#define STATUS_SHARED_IRQ_BUSY cpu_to_le32(0xC000016C)
-#define STATUS_FT_ORPHANING cpu_to_le32(0xC000016D)
-#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT cpu_to_le32(0xC000016E)
-#define STATUS_PARTITION_FAILURE cpu_to_le32(0xC0000172)
-#define STATUS_INVALID_BLOCK_LENGTH cpu_to_le32(0xC0000173)
-#define STATUS_DEVICE_NOT_PARTITIONED cpu_to_le32(0xC0000174)
-#define STATUS_UNABLE_TO_LOCK_MEDIA cpu_to_le32(0xC0000175)
-#define STATUS_UNABLE_TO_UNLOAD_MEDIA cpu_to_le32(0xC0000176)
-#define STATUS_EOM_OVERFLOW cpu_to_le32(0xC0000177)
-#define STATUS_NO_MEDIA cpu_to_le32(0xC0000178)
-#define STATUS_NO_SUCH_MEMBER cpu_to_le32(0xC000017A)
-#define STATUS_INVALID_MEMBER cpu_to_le32(0xC000017B)
-#define STATUS_KEY_DELETED cpu_to_le32(0xC000017C)
-#define STATUS_NO_LOG_SPACE cpu_to_le32(0xC000017D)
-#define STATUS_TOO_MANY_SIDS cpu_to_le32(0xC000017E)
-#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000017F)
-#define STATUS_KEY_HAS_CHILDREN cpu_to_le32(0xC0000180)
-#define STATUS_CHILD_MUST_BE_VOLATILE cpu_to_le32(0xC0000181)
-#define STATUS_DEVICE_CONFIGURATION_ERROR cpu_to_le32(0xC0000182)
-#define STATUS_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC0000183)
-#define STATUS_INVALID_DEVICE_STATE cpu_to_le32(0xC0000184)
-#define STATUS_IO_DEVICE_ERROR cpu_to_le32(0xC0000185)
-#define STATUS_DEVICE_PROTOCOL_ERROR cpu_to_le32(0xC0000186)
-#define STATUS_BACKUP_CONTROLLER cpu_to_le32(0xC0000187)
-#define STATUS_LOG_FILE_FULL cpu_to_le32(0xC0000188)
-#define STATUS_TOO_LATE cpu_to_le32(0xC0000189)
-#define STATUS_NO_TRUST_LSA_SECRET cpu_to_le32(0xC000018A)
-#define STATUS_NO_TRUST_SAM_ACCOUNT cpu_to_le32(0xC000018B)
-#define STATUS_TRUSTED_DOMAIN_FAILURE cpu_to_le32(0xC000018C)
-#define STATUS_TRUSTED_RELATIONSHIP_FAILURE cpu_to_le32(0xC000018D)
-#define STATUS_EVENTLOG_FILE_CORRUPT cpu_to_le32(0xC000018E)
-#define STATUS_EVENTLOG_CANT_START cpu_to_le32(0xC000018F)
-#define STATUS_TRUST_FAILURE cpu_to_le32(0xC0000190)
-#define STATUS_MUTANT_LIMIT_EXCEEDED cpu_to_le32(0xC0000191)
-#define STATUS_NETLOGON_NOT_STARTED cpu_to_le32(0xC0000192)
-#define STATUS_ACCOUNT_EXPIRED cpu_to_le32(0xC0000193)
-#define STATUS_POSSIBLE_DEADLOCK cpu_to_le32(0xC0000194)
-#define STATUS_NETWORK_CREDENTIAL_CONFLICT cpu_to_le32(0xC0000195)
-#define STATUS_REMOTE_SESSION_LIMIT cpu_to_le32(0xC0000196)
-#define STATUS_EVENTLOG_FILE_CHANGED cpu_to_le32(0xC0000197)
-#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT cpu_to_le32(0xC0000198)
-#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT cpu_to_le32(0xC0000199)
-#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT cpu_to_le32(0xC000019A)
-#define STATUS_DOMAIN_TRUST_INCONSISTENT cpu_to_le32(0xC000019B)
-#define STATUS_FS_DRIVER_REQUIRED cpu_to_le32(0xC000019C)
-#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL cpu_to_le32(0xC000019D)
-#define STATUS_NETWORK_OPEN_RESTRICTION cpu_to_le32(0xC0000201)
-#define STATUS_NO_USER_SESSION_KEY cpu_to_le32(0xC0000202)
-#define STATUS_USER_SESSION_DELETED cpu_to_le32(0xC0000203)
-#define STATUS_RESOURCE_LANG_NOT_FOUND cpu_to_le32(0xC0000204)
-#define STATUS_INSUFF_SERVER_RESOURCES cpu_to_le32(0xC0000205)
-#define STATUS_INVALID_BUFFER_SIZE cpu_to_le32(0xC0000206)
-#define STATUS_INVALID_ADDRESS_COMPONENT cpu_to_le32(0xC0000207)
-#define STATUS_INVALID_ADDRESS_WILDCARD cpu_to_le32(0xC0000208)
-#define STATUS_TOO_MANY_ADDRESSES cpu_to_le32(0xC0000209)
-#define STATUS_ADDRESS_ALREADY_EXISTS cpu_to_le32(0xC000020A)
-#define STATUS_ADDRESS_CLOSED cpu_to_le32(0xC000020B)
-#define STATUS_CONNECTION_DISCONNECTED cpu_to_le32(0xC000020C)
-#define STATUS_CONNECTION_RESET cpu_to_le32(0xC000020D)
-#define STATUS_TOO_MANY_NODES cpu_to_le32(0xC000020E)
-#define STATUS_TRANSACTION_ABORTED cpu_to_le32(0xC000020F)
-#define STATUS_TRANSACTION_TIMED_OUT cpu_to_le32(0xC0000210)
-#define STATUS_TRANSACTION_NO_RELEASE cpu_to_le32(0xC0000211)
-#define STATUS_TRANSACTION_NO_MATCH cpu_to_le32(0xC0000212)
-#define STATUS_TRANSACTION_RESPONDED cpu_to_le32(0xC0000213)
-#define STATUS_TRANSACTION_INVALID_ID cpu_to_le32(0xC0000214)
-#define STATUS_TRANSACTION_INVALID_TYPE cpu_to_le32(0xC0000215)
-#define STATUS_NOT_SERVER_SESSION cpu_to_le32(0xC0000216)
-#define STATUS_NOT_CLIENT_SESSION cpu_to_le32(0xC0000217)
-#define STATUS_CANNOT_LOAD_REGISTRY_FILE cpu_to_le32(0xC0000218)
-#define STATUS_DEBUG_ATTACH_FAILED cpu_to_le32(0xC0000219)
-#define STATUS_SYSTEM_PROCESS_TERMINATED cpu_to_le32(0xC000021A)
-#define STATUS_DATA_NOT_ACCEPTED cpu_to_le32(0xC000021B)
-#define STATUS_NO_BROWSER_SERVERS_FOUND cpu_to_le32(0xC000021C)
-#define STATUS_VDM_HARD_ERROR cpu_to_le32(0xC000021D)
-#define STATUS_DRIVER_CANCEL_TIMEOUT cpu_to_le32(0xC000021E)
-#define STATUS_REPLY_MESSAGE_MISMATCH cpu_to_le32(0xC000021F)
-#define STATUS_MAPPED_ALIGNMENT cpu_to_le32(0xC0000220)
-#define STATUS_IMAGE_CHECKSUM_MISMATCH cpu_to_le32(0xC0000221)
-#define STATUS_LOST_WRITEBEHIND_DATA cpu_to_le32(0xC0000222)
-#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID cpu_to_le32(0xC0000223)
-#define STATUS_PASSWORD_MUST_CHANGE cpu_to_le32(0xC0000224)
-#define STATUS_NOT_FOUND cpu_to_le32(0xC0000225)
-#define STATUS_NOT_TINY_STREAM cpu_to_le32(0xC0000226)
-#define STATUS_RECOVERY_FAILURE cpu_to_le32(0xC0000227)
-#define STATUS_STACK_OVERFLOW_READ cpu_to_le32(0xC0000228)
-#define STATUS_FAIL_CHECK cpu_to_le32(0xC0000229)
-#define STATUS_DUPLICATE_OBJECTID cpu_to_le32(0xC000022A)
-#define STATUS_OBJECTID_EXISTS cpu_to_le32(0xC000022B)
-#define STATUS_CONVERT_TO_LARGE cpu_to_le32(0xC000022C)
-#define STATUS_RETRY cpu_to_le32(0xC000022D)
-#define STATUS_FOUND_OUT_OF_SCOPE cpu_to_le32(0xC000022E)
-#define STATUS_ALLOCATE_BUCKET cpu_to_le32(0xC000022F)
-#define STATUS_PROPSET_NOT_FOUND cpu_to_le32(0xC0000230)
-#define STATUS_MARSHALL_OVERFLOW cpu_to_le32(0xC0000231)
-#define STATUS_INVALID_VARIANT cpu_to_le32(0xC0000232)
-#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND cpu_to_le32(0xC0000233)
-#define STATUS_ACCOUNT_LOCKED_OUT cpu_to_le32(0xC0000234)
-#define STATUS_HANDLE_NOT_CLOSABLE cpu_to_le32(0xC0000235)
-#define STATUS_CONNECTION_REFUSED cpu_to_le32(0xC0000236)
-#define STATUS_GRACEFUL_DISCONNECT cpu_to_le32(0xC0000237)
-#define STATUS_ADDRESS_ALREADY_ASSOCIATED cpu_to_le32(0xC0000238)
-#define STATUS_ADDRESS_NOT_ASSOCIATED cpu_to_le32(0xC0000239)
-#define STATUS_CONNECTION_INVALID cpu_to_le32(0xC000023A)
-#define STATUS_CONNECTION_ACTIVE cpu_to_le32(0xC000023B)
-#define STATUS_NETWORK_UNREACHABLE cpu_to_le32(0xC000023C)
-#define STATUS_HOST_UNREACHABLE cpu_to_le32(0xC000023D)
-#define STATUS_PROTOCOL_UNREACHABLE cpu_to_le32(0xC000023E)
-#define STATUS_PORT_UNREACHABLE cpu_to_le32(0xC000023F)
-#define STATUS_REQUEST_ABORTED cpu_to_le32(0xC0000240)
-#define STATUS_CONNECTION_ABORTED cpu_to_le32(0xC0000241)
-#define STATUS_BAD_COMPRESSION_BUFFER cpu_to_le32(0xC0000242)
-#define STATUS_USER_MAPPED_FILE cpu_to_le32(0xC0000243)
-#define STATUS_AUDIT_FAILED cpu_to_le32(0xC0000244)
-#define STATUS_TIMER_RESOLUTION_NOT_SET cpu_to_le32(0xC0000245)
-#define STATUS_CONNECTION_COUNT_LIMIT cpu_to_le32(0xC0000246)
-#define STATUS_LOGIN_TIME_RESTRICTION cpu_to_le32(0xC0000247)
-#define STATUS_LOGIN_WKSTA_RESTRICTION cpu_to_le32(0xC0000248)
-#define STATUS_IMAGE_MP_UP_MISMATCH cpu_to_le32(0xC0000249)
-#define STATUS_INSUFFICIENT_LOGON_INFO cpu_to_le32(0xC0000250)
-#define STATUS_BAD_DLL_ENTRYPOINT cpu_to_le32(0xC0000251)
-#define STATUS_BAD_SERVICE_ENTRYPOINT cpu_to_le32(0xC0000252)
-#define STATUS_LPC_REPLY_LOST cpu_to_le32(0xC0000253)
-#define STATUS_IP_ADDRESS_CONFLICT1 cpu_to_le32(0xC0000254)
-#define STATUS_IP_ADDRESS_CONFLICT2 cpu_to_le32(0xC0000255)
-#define STATUS_REGISTRY_QUOTA_LIMIT cpu_to_le32(0xC0000256)
-#define STATUS_PATH_NOT_COVERED cpu_to_le32(0xC0000257)
-#define STATUS_NO_CALLBACK_ACTIVE cpu_to_le32(0xC0000258)
-#define STATUS_LICENSE_QUOTA_EXCEEDED cpu_to_le32(0xC0000259)
-#define STATUS_PWD_TOO_SHORT cpu_to_le32(0xC000025A)
-#define STATUS_PWD_TOO_RECENT cpu_to_le32(0xC000025B)
-#define STATUS_PWD_HISTORY_CONFLICT cpu_to_le32(0xC000025C)
-#define STATUS_PLUGPLAY_NO_DEVICE cpu_to_le32(0xC000025E)
-#define STATUS_UNSUPPORTED_COMPRESSION cpu_to_le32(0xC000025F)
-#define STATUS_INVALID_HW_PROFILE cpu_to_le32(0xC0000260)
-#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH cpu_to_le32(0xC0000261)
-#define STATUS_DRIVER_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000262)
-#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000263)
-#define STATUS_RESOURCE_NOT_OWNED cpu_to_le32(0xC0000264)
-#define STATUS_TOO_MANY_LINKS cpu_to_le32(0xC0000265)
-#define STATUS_QUOTA_LIST_INCONSISTENT cpu_to_le32(0xC0000266)
-#define STATUS_FILE_IS_OFFLINE cpu_to_le32(0xC0000267)
-#define STATUS_EVALUATION_EXPIRATION cpu_to_le32(0xC0000268)
-#define STATUS_ILLEGAL_DLL_RELOCATION cpu_to_le32(0xC0000269)
-#define STATUS_LICENSE_VIOLATION cpu_to_le32(0xC000026A)
-#define STATUS_DLL_INIT_FAILED_LOGOFF cpu_to_le32(0xC000026B)
-#define STATUS_DRIVER_UNABLE_TO_LOAD cpu_to_le32(0xC000026C)
-#define STATUS_DFS_UNAVAILABLE cpu_to_le32(0xC000026D)
-#define STATUS_VOLUME_DISMOUNTED cpu_to_le32(0xC000026E)
-#define STATUS_WX86_INTERNAL_ERROR cpu_to_le32(0xC000026F)
-#define STATUS_WX86_FLOAT_STACK_CHECK cpu_to_le32(0xC0000270)
-#define STATUS_VALIDATE_CONTINUE cpu_to_le32(0xC0000271)
-#define STATUS_NO_MATCH cpu_to_le32(0xC0000272)
-#define STATUS_NO_MORE_MATCHES cpu_to_le32(0xC0000273)
-#define STATUS_NOT_A_REPARSE_POINT cpu_to_le32(0xC0000275)
-#define STATUS_IO_REPARSE_TAG_INVALID cpu_to_le32(0xC0000276)
-#define STATUS_IO_REPARSE_TAG_MISMATCH cpu_to_le32(0xC0000277)
-#define STATUS_IO_REPARSE_DATA_INVALID cpu_to_le32(0xC0000278)
-#define STATUS_IO_REPARSE_TAG_NOT_HANDLED cpu_to_le32(0xC0000279)
-#define STATUS_REPARSE_POINT_NOT_RESOLVED cpu_to_le32(0xC0000280)
-#define STATUS_DIRECTORY_IS_A_REPARSE_POINT cpu_to_le32(0xC0000281)
-#define STATUS_RANGE_LIST_CONFLICT cpu_to_le32(0xC0000282)
-#define STATUS_SOURCE_ELEMENT_EMPTY cpu_to_le32(0xC0000283)
-#define STATUS_DESTINATION_ELEMENT_FULL cpu_to_le32(0xC0000284)
-#define STATUS_ILLEGAL_ELEMENT_ADDRESS cpu_to_le32(0xC0000285)
-#define STATUS_MAGAZINE_NOT_PRESENT cpu_to_le32(0xC0000286)
-#define STATUS_REINITIALIZATION_NEEDED cpu_to_le32(0xC0000287)
-#define STATUS_ENCRYPTION_FAILED cpu_to_le32(0xC000028A)
-#define STATUS_DECRYPTION_FAILED cpu_to_le32(0xC000028B)
-#define STATUS_RANGE_NOT_FOUND cpu_to_le32(0xC000028C)
-#define STATUS_NO_RECOVERY_POLICY cpu_to_le32(0xC000028D)
-#define STATUS_NO_EFS cpu_to_le32(0xC000028E)
-#define STATUS_WRONG_EFS cpu_to_le32(0xC000028F)
-#define STATUS_NO_USER_KEYS cpu_to_le32(0xC0000290)
-#define STATUS_FILE_NOT_ENCRYPTED cpu_to_le32(0xC0000291)
-#define STATUS_NOT_EXPORT_FORMAT cpu_to_le32(0xC0000292)
-#define STATUS_FILE_ENCRYPTED cpu_to_le32(0xC0000293)
-#define STATUS_WMI_GUID_NOT_FOUND cpu_to_le32(0xC0000295)
-#define STATUS_WMI_INSTANCE_NOT_FOUND cpu_to_le32(0xC0000296)
-#define STATUS_WMI_ITEMID_NOT_FOUND cpu_to_le32(0xC0000297)
-#define STATUS_WMI_TRY_AGAIN cpu_to_le32(0xC0000298)
-#define STATUS_SHARED_POLICY cpu_to_le32(0xC0000299)
-#define STATUS_POLICY_OBJECT_NOT_FOUND cpu_to_le32(0xC000029A)
-#define STATUS_POLICY_ONLY_IN_DS cpu_to_le32(0xC000029B)
-#define STATUS_VOLUME_NOT_UPGRADED cpu_to_le32(0xC000029C)
-#define STATUS_REMOTE_STORAGE_NOT_ACTIVE cpu_to_le32(0xC000029D)
-#define STATUS_REMOTE_STORAGE_MEDIA_ERROR cpu_to_le32(0xC000029E)
-#define STATUS_NO_TRACKING_SERVICE cpu_to_le32(0xC000029F)
-#define STATUS_SERVER_SID_MISMATCH cpu_to_le32(0xC00002A0)
-#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE cpu_to_le32(0xC00002A1)
-#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX cpu_to_le32(0xC00002A2)
-#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED cpu_to_le32(0xC00002A3)
-#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS cpu_to_le32(0xC00002A4)
-#define STATUS_DS_BUSY cpu_to_le32(0xC00002A5)
-#define STATUS_DS_UNAVAILABLE cpu_to_le32(0xC00002A6)
-#define STATUS_DS_NO_RIDS_ALLOCATED cpu_to_le32(0xC00002A7)
-#define STATUS_DS_NO_MORE_RIDS cpu_to_le32(0xC00002A8)
-#define STATUS_DS_INCORRECT_ROLE_OWNER cpu_to_le32(0xC00002A9)
-#define STATUS_DS_RIDMGR_INIT_ERROR cpu_to_le32(0xC00002AA)
-#define STATUS_DS_OBJ_CLASS_VIOLATION cpu_to_le32(0xC00002AB)
-#define STATUS_DS_CANT_ON_NON_LEAF cpu_to_le32(0xC00002AC)
-#define STATUS_DS_CANT_ON_RDN cpu_to_le32(0xC00002AD)
-#define STATUS_DS_CANT_MOD_OBJ_CLASS cpu_to_le32(0xC00002AE)
-#define STATUS_DS_CROSS_DOM_MOVE_FAILED cpu_to_le32(0xC00002AF)
-#define STATUS_DS_GC_NOT_AVAILABLE cpu_to_le32(0xC00002B0)
-#define STATUS_DIRECTORY_SERVICE_REQUIRED cpu_to_le32(0xC00002B1)
-#define STATUS_REPARSE_ATTRIBUTE_CONFLICT cpu_to_le32(0xC00002B2)
-#define STATUS_CANT_ENABLE_DENY_ONLY cpu_to_le32(0xC00002B3)
-#define STATUS_FLOAT_MULTIPLE_FAULTS cpu_to_le32(0xC00002B4)
-#define STATUS_FLOAT_MULTIPLE_TRAPS cpu_to_le32(0xC00002B5)
-#define STATUS_DEVICE_REMOVED cpu_to_le32(0xC00002B6)
-#define STATUS_JOURNAL_DELETE_IN_PROGRESS cpu_to_le32(0xC00002B7)
-#define STATUS_JOURNAL_NOT_ACTIVE cpu_to_le32(0xC00002B8)
-#define STATUS_NOINTERFACE cpu_to_le32(0xC00002B9)
-#define STATUS_DS_ADMIN_LIMIT_EXCEEDED cpu_to_le32(0xC00002C1)
-#define STATUS_DRIVER_FAILED_SLEEP cpu_to_le32(0xC00002C2)
-#define STATUS_MUTUAL_AUTHENTICATION_FAILED cpu_to_le32(0xC00002C3)
-#define STATUS_CORRUPT_SYSTEM_FILE cpu_to_le32(0xC00002C4)
-#define STATUS_DATATYPE_MISALIGNMENT_ERROR cpu_to_le32(0xC00002C5)
-#define STATUS_WMI_READ_ONLY cpu_to_le32(0xC00002C6)
-#define STATUS_WMI_SET_FAILURE cpu_to_le32(0xC00002C7)
-#define STATUS_COMMITMENT_MINIMUM cpu_to_le32(0xC00002C8)
-#define STATUS_REG_NAT_CONSUMPTION cpu_to_le32(0xC00002C9)
-#define STATUS_TRANSPORT_FULL cpu_to_le32(0xC00002CA)
-#define STATUS_DS_SAM_INIT_FAILURE cpu_to_le32(0xC00002CB)
-#define STATUS_ONLY_IF_CONNECTED cpu_to_le32(0xC00002CC)
-#define STATUS_DS_SENSITIVE_GROUP_VIOLATION cpu_to_le32(0xC00002CD)
-#define STATUS_PNP_RESTART_ENUMERATION cpu_to_le32(0xC00002CE)
-#define STATUS_JOURNAL_ENTRY_DELETED cpu_to_le32(0xC00002CF)
-#define STATUS_DS_CANT_MOD_PRIMARYGROUPID cpu_to_le32(0xC00002D0)
-#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE cpu_to_le32(0xC00002D1)
-#define STATUS_PNP_REBOOT_REQUIRED cpu_to_le32(0xC00002D2)
-#define STATUS_POWER_STATE_INVALID cpu_to_le32(0xC00002D3)
-#define STATUS_DS_INVALID_GROUP_TYPE cpu_to_le32(0xC00002D4)
-#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D5)
-#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D6)
-#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D7)
-#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC00002D8)
-#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D9)
-#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER cpu_to_le32(0xC00002DA)
-#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER \
- cpu_to_le32(0xC00002DB)
-#define STATUS_DS_HAVE_PRIMARY_MEMBERS cpu_to_le32(0xC00002DC)
-#define STATUS_WMI_NOT_SUPPORTED cpu_to_le32(0xC00002DD)
-#define STATUS_INSUFFICIENT_POWER cpu_to_le32(0xC00002DE)
-#define STATUS_SAM_NEED_BOOTKEY_PASSWORD cpu_to_le32(0xC00002DF)
-#define STATUS_SAM_NEED_BOOTKEY_FLOPPY cpu_to_le32(0xC00002E0)
-#define STATUS_DS_CANT_START cpu_to_le32(0xC00002E1)
-#define STATUS_DS_INIT_FAILURE cpu_to_le32(0xC00002E2)
-#define STATUS_SAM_INIT_FAILURE cpu_to_le32(0xC00002E3)
-#define STATUS_DS_GC_REQUIRED cpu_to_le32(0xC00002E4)
-#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY cpu_to_le32(0xC00002E5)
-#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS cpu_to_le32(0xC00002E6)
-#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED cpu_to_le32(0xC00002E7)
-#define STATUS_MULTIPLE_FAULT_VIOLATION cpu_to_le32(0xC00002E8)
-#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED cpu_to_le32(0xC00002E9)
-#define STATUS_CANNOT_MAKE cpu_to_le32(0xC00002EA)
-#define STATUS_SYSTEM_SHUTDOWN cpu_to_le32(0xC00002EB)
-#define STATUS_DS_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002EC)
-#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002ED)
-#define STATUS_UNFINISHED_CONTEXT_DELETED cpu_to_le32(0xC00002EE)
-#define STATUS_NO_TGT_REPLY cpu_to_le32(0xC00002EF)
-#define STATUS_OBJECTID_NOT_FOUND cpu_to_le32(0xC00002F0)
-#define STATUS_NO_IP_ADDRESSES cpu_to_le32(0xC00002F1)
-#define STATUS_WRONG_CREDENTIAL_HANDLE cpu_to_le32(0xC00002F2)
-#define STATUS_CRYPTO_SYSTEM_INVALID cpu_to_le32(0xC00002F3)
-#define STATUS_MAX_REFERRALS_EXCEEDED cpu_to_le32(0xC00002F4)
-#define STATUS_MUST_BE_KDC cpu_to_le32(0xC00002F5)
-#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED cpu_to_le32(0xC00002F6)
-#define STATUS_TOO_MANY_PRINCIPALS cpu_to_le32(0xC00002F7)
-#define STATUS_NO_PA_DATA cpu_to_le32(0xC00002F8)
-#define STATUS_PKINIT_NAME_MISMATCH cpu_to_le32(0xC00002F9)
-#define STATUS_SMARTCARD_LOGON_REQUIRED cpu_to_le32(0xC00002FA)
-#define STATUS_KDC_INVALID_REQUEST cpu_to_le32(0xC00002FB)
-#define STATUS_KDC_UNABLE_TO_REFER cpu_to_le32(0xC00002FC)
-#define STATUS_KDC_UNKNOWN_ETYPE cpu_to_le32(0xC00002FD)
-#define STATUS_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FE)
-#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FF)
-#define STATUS_NOT_SUPPORTED_ON_SBS cpu_to_le32(0xC0000300)
-#define STATUS_WMI_GUID_DISCONNECTED cpu_to_le32(0xC0000301)
-#define STATUS_WMI_ALREADY_DISABLED cpu_to_le32(0xC0000302)
-#define STATUS_WMI_ALREADY_ENABLED cpu_to_le32(0xC0000303)
-#define STATUS_MFT_TOO_FRAGMENTED cpu_to_le32(0xC0000304)
-#define STATUS_COPY_PROTECTION_FAILURE cpu_to_le32(0xC0000305)
-#define STATUS_CSS_AUTHENTICATION_FAILURE cpu_to_le32(0xC0000306)
-#define STATUS_CSS_KEY_NOT_PRESENT cpu_to_le32(0xC0000307)
-#define STATUS_CSS_KEY_NOT_ESTABLISHED cpu_to_le32(0xC0000308)
-#define STATUS_CSS_SCRAMBLED_SECTOR cpu_to_le32(0xC0000309)
-#define STATUS_CSS_REGION_MISMATCH cpu_to_le32(0xC000030A)
-#define STATUS_CSS_RESETS_EXHAUSTED cpu_to_le32(0xC000030B)
-#define STATUS_PKINIT_FAILURE cpu_to_le32(0xC0000320)
-#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE cpu_to_le32(0xC0000321)
-#define STATUS_NO_KERB_KEY cpu_to_le32(0xC0000322)
-#define STATUS_HOST_DOWN cpu_to_le32(0xC0000350)
-#define STATUS_UNSUPPORTED_PREAUTH cpu_to_le32(0xC0000351)
-#define STATUS_EFS_ALG_BLOB_TOO_BIG cpu_to_le32(0xC0000352)
-#define STATUS_PORT_NOT_SET cpu_to_le32(0xC0000353)
-#define STATUS_DEBUGGER_INACTIVE cpu_to_le32(0xC0000354)
-#define STATUS_DS_VERSION_CHECK_FAILURE cpu_to_le32(0xC0000355)
-#define STATUS_AUDITING_DISABLED cpu_to_le32(0xC0000356)
-#define STATUS_PRENT4_MACHINE_ACCOUNT cpu_to_le32(0xC0000357)
-#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC0000358)
-#define STATUS_INVALID_IMAGE_WIN_32 cpu_to_le32(0xC0000359)
-#define STATUS_INVALID_IMAGE_WIN_64 cpu_to_le32(0xC000035A)
-#define STATUS_BAD_BINDINGS cpu_to_le32(0xC000035B)
-#define STATUS_NETWORK_SESSION_EXPIRED cpu_to_le32(0xC000035C)
-#define STATUS_APPHELP_BLOCK cpu_to_le32(0xC000035D)
-#define STATUS_ALL_SIDS_FILTERED cpu_to_le32(0xC000035E)
-#define STATUS_NOT_SAFE_MODE_DRIVER cpu_to_le32(0xC000035F)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT cpu_to_le32(0xC0000361)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH cpu_to_le32(0xC0000362)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER cpu_to_le32(0xC0000363)
-#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER cpu_to_le32(0xC0000364)
-#define STATUS_FAILED_DRIVER_ENTRY cpu_to_le32(0xC0000365)
-#define STATUS_DEVICE_ENUMERATION_ERROR cpu_to_le32(0xC0000366)
-#define STATUS_MOUNT_POINT_NOT_RESOLVED cpu_to_le32(0xC0000368)
-#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER cpu_to_le32(0xC0000369)
-#define STATUS_MCA_OCCURRED cpu_to_le32(0xC000036A)
-#define STATUS_DRIVER_BLOCKED_CRITICAL cpu_to_le32(0xC000036B)
-#define STATUS_DRIVER_BLOCKED cpu_to_le32(0xC000036C)
-#define STATUS_DRIVER_DATABASE_ERROR cpu_to_le32(0xC000036D)
-#define STATUS_SYSTEM_HIVE_TOO_LARGE cpu_to_le32(0xC000036E)
-#define STATUS_INVALID_IMPORT_OF_NON_DLL cpu_to_le32(0xC000036F)
-#define STATUS_NO_SECRETS cpu_to_le32(0xC0000371)
-#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY cpu_to_le32(0xC0000372)
-#define STATUS_FAILED_STACK_SWITCH cpu_to_le32(0xC0000373)
-#define STATUS_HEAP_CORRUPTION cpu_to_le32(0xC0000374)
-#define STATUS_SMARTCARD_WRONG_PIN cpu_to_le32(0xC0000380)
-#define STATUS_SMARTCARD_CARD_BLOCKED cpu_to_le32(0xC0000381)
-#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED cpu_to_le32(0xC0000382)
-#define STATUS_SMARTCARD_NO_CARD cpu_to_le32(0xC0000383)
-#define STATUS_SMARTCARD_NO_KEY_CONTAINER cpu_to_le32(0xC0000384)
-#define STATUS_SMARTCARD_NO_CERTIFICATE cpu_to_le32(0xC0000385)
-#define STATUS_SMARTCARD_NO_KEYSET cpu_to_le32(0xC0000386)
-#define STATUS_SMARTCARD_IO_ERROR cpu_to_le32(0xC0000387)
-#define STATUS_DOWNGRADE_DETECTED cpu_to_le32(0xC0000388)
-#define STATUS_SMARTCARD_CERT_REVOKED cpu_to_le32(0xC0000389)
-#define STATUS_ISSUING_CA_UNTRUSTED cpu_to_le32(0xC000038A)
-#define STATUS_REVOCATION_OFFLINE_C cpu_to_le32(0xC000038B)
-#define STATUS_PKINIT_CLIENT_FAILURE cpu_to_le32(0xC000038C)
-#define STATUS_SMARTCARD_CERT_EXPIRED cpu_to_le32(0xC000038D)
-#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD cpu_to_le32(0xC000038E)
-#define STATUS_SMARTCARD_SILENT_CONTEXT cpu_to_le32(0xC000038F)
-#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000401)
-#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000402)
-#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000403)
-#define STATUS_DS_NAME_NOT_UNIQUE cpu_to_le32(0xC0000404)
-#define STATUS_DS_DUPLICATE_ID_FOUND cpu_to_le32(0xC0000405)
-#define STATUS_DS_GROUP_CONVERSION_ERROR cpu_to_le32(0xC0000406)
-#define STATUS_VOLSNAP_PREPARE_HIBERNATE cpu_to_le32(0xC0000407)
-#define STATUS_USER2USER_REQUIRED cpu_to_le32(0xC0000408)
-#define STATUS_STACK_BUFFER_OVERRUN cpu_to_le32(0xC0000409)
-#define STATUS_NO_S4U_PROT_SUPPORT cpu_to_le32(0xC000040A)
-#define STATUS_CROSSREALM_DELEGATION_FAILURE cpu_to_le32(0xC000040B)
-#define STATUS_REVOCATION_OFFLINE_KDC cpu_to_le32(0xC000040C)
-#define STATUS_ISSUING_CA_UNTRUSTED_KDC cpu_to_le32(0xC000040D)
-#define STATUS_KDC_CERT_EXPIRED cpu_to_le32(0xC000040E)
-#define STATUS_KDC_CERT_REVOKED cpu_to_le32(0xC000040F)
-#define STATUS_PARAMETER_QUOTA_EXCEEDED cpu_to_le32(0xC0000410)
-#define STATUS_HIBERNATION_FAILURE cpu_to_le32(0xC0000411)
-#define STATUS_DELAY_LOAD_FAILED cpu_to_le32(0xC0000412)
-#define STATUS_AUTHENTICATION_FIREWALL_FAILED cpu_to_le32(0xC0000413)
-#define STATUS_VDM_DISALLOWED cpu_to_le32(0xC0000414)
-#define STATUS_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC0000415)
-#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE \
- cpu_to_le32(0xC0000416)
-#define STATUS_INVALID_CRUNTIME_PARAMETER cpu_to_le32(0xC0000417)
-#define STATUS_NTLM_BLOCKED cpu_to_le32(0xC0000418)
-#define STATUS_ASSERTION_FAILURE cpu_to_le32(0xC0000420)
-#define STATUS_VERIFIER_STOP cpu_to_le32(0xC0000421)
-#define STATUS_CALLBACK_POP_STACK cpu_to_le32(0xC0000423)
-#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED cpu_to_le32(0xC0000424)
-#define STATUS_HIVE_UNLOADED cpu_to_le32(0xC0000425)
-#define STATUS_COMPRESSION_DISABLED cpu_to_le32(0xC0000426)
-#define STATUS_FILE_SYSTEM_LIMITATION cpu_to_le32(0xC0000427)
-#define STATUS_INVALID_IMAGE_HASH cpu_to_le32(0xC0000428)
-#define STATUS_NOT_CAPABLE cpu_to_le32(0xC0000429)
-#define STATUS_REQUEST_OUT_OF_SEQUENCE cpu_to_le32(0xC000042A)
-#define STATUS_IMPLEMENTATION_LIMIT cpu_to_le32(0xC000042B)
-#define STATUS_ELEVATION_REQUIRED cpu_to_le32(0xC000042C)
-#define STATUS_BEYOND_VDL cpu_to_le32(0xC0000432)
-#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS cpu_to_le32(0xC0000433)
-#define STATUS_PTE_CHANGED cpu_to_le32(0xC0000434)
-#define STATUS_PURGE_FAILED cpu_to_le32(0xC0000435)
-#define STATUS_CRED_REQUIRES_CONFIRMATION cpu_to_le32(0xC0000440)
-#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE cpu_to_le32(0xC0000441)
-#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER cpu_to_le32(0xC0000442)
-#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE cpu_to_le32(0xC0000443)
-#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE cpu_to_le32(0xC0000444)
-#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE cpu_to_le32(0xC0000445)
-#define STATUS_INVALID_LABEL cpu_to_le32(0xC0000446)
-#define STATUS_DRIVER_PROCESS_TERMINATED cpu_to_le32(0xC0000450)
-#define STATUS_AMBIGUOUS_SYSTEM_DEVICE cpu_to_le32(0xC0000451)
-#define STATUS_SYSTEM_DEVICE_NOT_FOUND cpu_to_le32(0xC0000452)
-#define STATUS_RESTART_BOOT_APPLICATION cpu_to_le32(0xC0000453)
-#define STATUS_INVALID_TASK_NAME cpu_to_le32(0xC0000500)
-#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
-#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
-#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
-#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
-#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
-#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
-#define STATUS_REQUEST_CANCELED cpu_to_le32(0xC0000703)
-#define STATUS_RECURSIVE_DISPATCH cpu_to_le32(0xC0000704)
-#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED cpu_to_le32(0xC0000705)
-#define STATUS_LPC_INVALID_CONNECTION_USAGE cpu_to_le32(0xC0000706)
-#define STATUS_LPC_REQUESTS_NOT_ALLOWED cpu_to_le32(0xC0000707)
-#define STATUS_RESOURCE_IN_USE cpu_to_le32(0xC0000708)
-#define STATUS_HARDWARE_MEMORY_ERROR cpu_to_le32(0xC0000709)
-#define STATUS_THREADPOOL_HANDLE_EXCEPTION cpu_to_le32(0xC000070A)
-#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED cpu_to_le32(0xC000070B)
-#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED \
- cpu_to_le32(0xC000070C)
-#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED \
- cpu_to_le32(0xC000070D)
-#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED \
- cpu_to_le32(0xC000070E)
-#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION cpu_to_le32(0xC000070F)
-#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000710)
-#define STATUS_APC_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000711)
-#define STATUS_PROCESS_IS_PROTECTED cpu_to_le32(0xC0000712)
-#define STATUS_MCA_EXCEPTION cpu_to_le32(0xC0000713)
-#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE cpu_to_le32(0xC0000714)
-#define STATUS_SYMLINK_CLASS_DISABLED cpu_to_le32(0xC0000715)
-#define STATUS_INVALID_IDN_NORMALIZATION cpu_to_le32(0xC0000716)
-#define STATUS_NO_UNICODE_TRANSLATION cpu_to_le32(0xC0000717)
-#define STATUS_ALREADY_REGISTERED cpu_to_le32(0xC0000718)
-#define STATUS_CONTEXT_MISMATCH cpu_to_le32(0xC0000719)
-#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST cpu_to_le32(0xC000071A)
-#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY cpu_to_le32(0xC000071B)
-#define STATUS_INVALID_THREAD cpu_to_le32(0xC000071C)
-#define STATUS_CALLBACK_RETURNED_TRANSACTION cpu_to_le32(0xC000071D)
-#define STATUS_CALLBACK_RETURNED_LDR_LOCK cpu_to_le32(0xC000071E)
-#define STATUS_CALLBACK_RETURNED_LANG cpu_to_le32(0xC000071F)
-#define STATUS_CALLBACK_RETURNED_PRI_BACK cpu_to_le32(0xC0000720)
-#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY cpu_to_le32(0xC0000721)
-#define STATUS_DISK_REPAIR_DISABLED cpu_to_le32(0xC0000800)
-#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS cpu_to_le32(0xC0000801)
-#define STATUS_DISK_QUOTA_EXCEEDED cpu_to_le32(0xC0000802)
-#define STATUS_CONTENT_BLOCKED cpu_to_le32(0xC0000804)
-#define STATUS_BAD_CLUSTERS cpu_to_le32(0xC0000805)
-#define STATUS_VOLUME_DIRTY cpu_to_le32(0xC0000806)
-#define STATUS_FILE_CHECKED_OUT cpu_to_le32(0xC0000901)
-#define STATUS_CHECKOUT_REQUIRED cpu_to_le32(0xC0000902)
-#define STATUS_BAD_FILE_TYPE cpu_to_le32(0xC0000903)
-#define STATUS_FILE_TOO_LARGE cpu_to_le32(0xC0000904)
-#define STATUS_FORMS_AUTH_REQUIRED cpu_to_le32(0xC0000905)
-#define STATUS_VIRUS_INFECTED cpu_to_le32(0xC0000906)
-#define STATUS_VIRUS_DELETED cpu_to_le32(0xC0000907)
-#define STATUS_BAD_MCFG_TABLE cpu_to_le32(0xC0000908)
-#define STATUS_WOW_ASSERTION cpu_to_le32(0xC0009898)
-#define STATUS_INVALID_SIGNATURE cpu_to_le32(0xC000A000)
-#define STATUS_HMAC_NOT_SUPPORTED cpu_to_le32(0xC000A001)
-#define STATUS_IPSEC_QUEUE_OVERFLOW cpu_to_le32(0xC000A010)
-#define STATUS_ND_QUEUE_OVERFLOW cpu_to_le32(0xC000A011)
-#define STATUS_HOPLIMIT_EXCEEDED cpu_to_le32(0xC000A012)
-#define STATUS_PROTOCOL_NOT_SUPPORTED cpu_to_le32(0xC000A013)
-#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED \
- cpu_to_le32(0xC000A080)
-#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR \
- cpu_to_le32(0xC000A081)
-#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR cpu_to_le32(0xC000A082)
-#define STATUS_XML_PARSE_ERROR cpu_to_le32(0xC000A083)
-#define STATUS_XMLDSIG_ERROR cpu_to_le32(0xC000A084)
-#define STATUS_WRONG_COMPARTMENT cpu_to_le32(0xC000A085)
-#define STATUS_AUTHIP_FAILURE cpu_to_le32(0xC000A086)
-#define DBG_NO_STATE_CHANGE cpu_to_le32(0xC0010001)
-#define DBG_APP_NOT_IDLE cpu_to_le32(0xC0010002)
-#define RPC_NT_INVALID_STRING_BINDING cpu_to_le32(0xC0020001)
-#define RPC_NT_WRONG_KIND_OF_BINDING cpu_to_le32(0xC0020002)
-#define RPC_NT_INVALID_BINDING cpu_to_le32(0xC0020003)
-#define RPC_NT_PROTSEQ_NOT_SUPPORTED cpu_to_le32(0xC0020004)
-#define RPC_NT_INVALID_RPC_PROTSEQ cpu_to_le32(0xC0020005)
-#define RPC_NT_INVALID_STRING_UUID cpu_to_le32(0xC0020006)
-#define RPC_NT_INVALID_ENDPOINT_FORMAT cpu_to_le32(0xC0020007)
-#define RPC_NT_INVALID_NET_ADDR cpu_to_le32(0xC0020008)
-#define RPC_NT_NO_ENDPOINT_FOUND cpu_to_le32(0xC0020009)
-#define RPC_NT_INVALID_TIMEOUT cpu_to_le32(0xC002000A)
-#define RPC_NT_OBJECT_NOT_FOUND cpu_to_le32(0xC002000B)
-#define RPC_NT_ALREADY_REGISTERED cpu_to_le32(0xC002000C)
-#define RPC_NT_TYPE_ALREADY_REGISTERED cpu_to_le32(0xC002000D)
-#define RPC_NT_ALREADY_LISTENING cpu_to_le32(0xC002000E)
-#define RPC_NT_NO_PROTSEQS_REGISTERED cpu_to_le32(0xC002000F)
-#define RPC_NT_NOT_LISTENING cpu_to_le32(0xC0020010)
-#define RPC_NT_UNKNOWN_MGR_TYPE cpu_to_le32(0xC0020011)
-#define RPC_NT_UNKNOWN_IF cpu_to_le32(0xC0020012)
-#define RPC_NT_NO_BINDINGS cpu_to_le32(0xC0020013)
-#define RPC_NT_NO_PROTSEQS cpu_to_le32(0xC0020014)
-#define RPC_NT_CANT_CREATE_ENDPOINT cpu_to_le32(0xC0020015)
-#define RPC_NT_OUT_OF_RESOURCES cpu_to_le32(0xC0020016)
-#define RPC_NT_SERVER_UNAVAILABLE cpu_to_le32(0xC0020017)
-#define RPC_NT_SERVER_TOO_BUSY cpu_to_le32(0xC0020018)
-#define RPC_NT_INVALID_NETWORK_OPTIONS cpu_to_le32(0xC0020019)
-#define RPC_NT_NO_CALL_ACTIVE cpu_to_le32(0xC002001A)
-#define RPC_NT_CALL_FAILED cpu_to_le32(0xC002001B)
-#define RPC_NT_CALL_FAILED_DNE cpu_to_le32(0xC002001C)
-#define RPC_NT_PROTOCOL_ERROR cpu_to_le32(0xC002001D)
-#define RPC_NT_UNSUPPORTED_TRANS_SYN cpu_to_le32(0xC002001F)
-#define RPC_NT_UNSUPPORTED_TYPE cpu_to_le32(0xC0020021)
-#define RPC_NT_INVALID_TAG cpu_to_le32(0xC0020022)
-#define RPC_NT_INVALID_BOUND cpu_to_le32(0xC0020023)
-#define RPC_NT_NO_ENTRY_NAME cpu_to_le32(0xC0020024)
-#define RPC_NT_INVALID_NAME_SYNTAX cpu_to_le32(0xC0020025)
-#define RPC_NT_UNSUPPORTED_NAME_SYNTAX cpu_to_le32(0xC0020026)
-#define RPC_NT_UUID_NO_ADDRESS cpu_to_le32(0xC0020028)
-#define RPC_NT_DUPLICATE_ENDPOINT cpu_to_le32(0xC0020029)
-#define RPC_NT_UNKNOWN_AUTHN_TYPE cpu_to_le32(0xC002002A)
-#define RPC_NT_MAX_CALLS_TOO_SMALL cpu_to_le32(0xC002002B)
-#define RPC_NT_STRING_TOO_LONG cpu_to_le32(0xC002002C)
-#define RPC_NT_PROTSEQ_NOT_FOUND cpu_to_le32(0xC002002D)
-#define RPC_NT_PROCNUM_OUT_OF_RANGE cpu_to_le32(0xC002002E)
-#define RPC_NT_BINDING_HAS_NO_AUTH cpu_to_le32(0xC002002F)
-#define RPC_NT_UNKNOWN_AUTHN_SERVICE cpu_to_le32(0xC0020030)
-#define RPC_NT_UNKNOWN_AUTHN_LEVEL cpu_to_le32(0xC0020031)
-#define RPC_NT_INVALID_AUTH_IDENTITY cpu_to_le32(0xC0020032)
-#define RPC_NT_UNKNOWN_AUTHZ_SERVICE cpu_to_le32(0xC0020033)
-#define EPT_NT_INVALID_ENTRY cpu_to_le32(0xC0020034)
-#define EPT_NT_CANT_PERFORM_OP cpu_to_le32(0xC0020035)
-#define EPT_NT_NOT_REGISTERED cpu_to_le32(0xC0020036)
-#define RPC_NT_NOTHING_TO_EXPORT cpu_to_le32(0xC0020037)
-#define RPC_NT_INCOMPLETE_NAME cpu_to_le32(0xC0020038)
-#define RPC_NT_INVALID_VERS_OPTION cpu_to_le32(0xC0020039)
-#define RPC_NT_NO_MORE_MEMBERS cpu_to_le32(0xC002003A)
-#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED cpu_to_le32(0xC002003B)
-#define RPC_NT_INTERFACE_NOT_FOUND cpu_to_le32(0xC002003C)
-#define RPC_NT_ENTRY_ALREADY_EXISTS cpu_to_le32(0xC002003D)
-#define RPC_NT_ENTRY_NOT_FOUND cpu_to_le32(0xC002003E)
-#define RPC_NT_NAME_SERVICE_UNAVAILABLE cpu_to_le32(0xC002003F)
-#define RPC_NT_INVALID_NAF_ID cpu_to_le32(0xC0020040)
-#define RPC_NT_CANNOT_SUPPORT cpu_to_le32(0xC0020041)
-#define RPC_NT_NO_CONTEXT_AVAILABLE cpu_to_le32(0xC0020042)
-#define RPC_NT_INTERNAL_ERROR cpu_to_le32(0xC0020043)
-#define RPC_NT_ZERO_DIVIDE cpu_to_le32(0xC0020044)
-#define RPC_NT_ADDRESS_ERROR cpu_to_le32(0xC0020045)
-#define RPC_NT_FP_DIV_ZERO cpu_to_le32(0xC0020046)
-#define RPC_NT_FP_UNDERFLOW cpu_to_le32(0xC0020047)
-#define RPC_NT_FP_OVERFLOW cpu_to_le32(0xC0020048)
-#define RPC_NT_CALL_IN_PROGRESS cpu_to_le32(0xC0020049)
-#define RPC_NT_NO_MORE_BINDINGS cpu_to_le32(0xC002004A)
-#define RPC_NT_GROUP_MEMBER_NOT_FOUND cpu_to_le32(0xC002004B)
-#define EPT_NT_CANT_CREATE cpu_to_le32(0xC002004C)
-#define RPC_NT_INVALID_OBJECT cpu_to_le32(0xC002004D)
-#define RPC_NT_NO_INTERFACES cpu_to_le32(0xC002004F)
-#define RPC_NT_CALL_CANCELLED cpu_to_le32(0xC0020050)
-#define RPC_NT_BINDING_INCOMPLETE cpu_to_le32(0xC0020051)
-#define RPC_NT_COMM_FAILURE cpu_to_le32(0xC0020052)
-#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL cpu_to_le32(0xC0020053)
-#define RPC_NT_NO_PRINC_NAME cpu_to_le32(0xC0020054)
-#define RPC_NT_NOT_RPC_ERROR cpu_to_le32(0xC0020055)
-#define RPC_NT_SEC_PKG_ERROR cpu_to_le32(0xC0020057)
-#define RPC_NT_NOT_CANCELLED cpu_to_le32(0xC0020058)
-#define RPC_NT_INVALID_ASYNC_HANDLE cpu_to_le32(0xC0020062)
-#define RPC_NT_INVALID_ASYNC_CALL cpu_to_le32(0xC0020063)
-#define RPC_NT_PROXY_ACCESS_DENIED cpu_to_le32(0xC0020064)
-#define RPC_NT_NO_MORE_ENTRIES cpu_to_le32(0xC0030001)
-#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL cpu_to_le32(0xC0030002)
-#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE cpu_to_le32(0xC0030003)
-#define RPC_NT_SS_IN_NULL_CONTEXT cpu_to_le32(0xC0030004)
-#define RPC_NT_SS_CONTEXT_MISMATCH cpu_to_le32(0xC0030005)
-#define RPC_NT_SS_CONTEXT_DAMAGED cpu_to_le32(0xC0030006)
-#define RPC_NT_SS_HANDLES_MISMATCH cpu_to_le32(0xC0030007)
-#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE cpu_to_le32(0xC0030008)
-#define RPC_NT_NULL_REF_POINTER cpu_to_le32(0xC0030009)
-#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE cpu_to_le32(0xC003000A)
-#define RPC_NT_BYTE_COUNT_TOO_SMALL cpu_to_le32(0xC003000B)
-#define RPC_NT_BAD_STUB_DATA cpu_to_le32(0xC003000C)
-#define RPC_NT_INVALID_ES_ACTION cpu_to_le32(0xC0030059)
-#define RPC_NT_WRONG_ES_VERSION cpu_to_le32(0xC003005A)
-#define RPC_NT_WRONG_STUB_VERSION cpu_to_le32(0xC003005B)
-#define RPC_NT_INVALID_PIPE_OBJECT cpu_to_le32(0xC003005C)
-#define RPC_NT_INVALID_PIPE_OPERATION cpu_to_le32(0xC003005D)
-#define RPC_NT_WRONG_PIPE_VERSION cpu_to_le32(0xC003005E)
-#define RPC_NT_PIPE_CLOSED cpu_to_le32(0xC003005F)
-#define RPC_NT_PIPE_DISCIPLINE_ERROR cpu_to_le32(0xC0030060)
-#define RPC_NT_PIPE_EMPTY cpu_to_le32(0xC0030061)
-#define STATUS_PNP_BAD_MPS_TABLE cpu_to_le32(0xC0040035)
-#define STATUS_PNP_TRANSLATION_FAILED cpu_to_le32(0xC0040036)
-#define STATUS_PNP_IRQ_TRANSLATION_FAILED cpu_to_le32(0xC0040037)
-#define STATUS_PNP_INVALID_ID cpu_to_le32(0xC0040038)
-#define STATUS_IO_REISSUE_AS_CACHED cpu_to_le32(0xC0040039)
-#define STATUS_CTX_WINSTATION_NAME_INVALID cpu_to_le32(0xC00A0001)
-#define STATUS_CTX_INVALID_PD cpu_to_le32(0xC00A0002)
-#define STATUS_CTX_PD_NOT_FOUND cpu_to_le32(0xC00A0003)
-#define STATUS_CTX_CLOSE_PENDING cpu_to_le32(0xC00A0006)
-#define STATUS_CTX_NO_OUTBUF cpu_to_le32(0xC00A0007)
-#define STATUS_CTX_MODEM_INF_NOT_FOUND cpu_to_le32(0xC00A0008)
-#define STATUS_CTX_INVALID_MODEMNAME cpu_to_le32(0xC00A0009)
-#define STATUS_CTX_RESPONSE_ERROR cpu_to_le32(0xC00A000A)
-#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT cpu_to_le32(0xC00A000B)
-#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER cpu_to_le32(0xC00A000C)
-#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE cpu_to_le32(0xC00A000D)
-#define STATUS_CTX_MODEM_RESPONSE_BUSY cpu_to_le32(0xC00A000E)
-#define STATUS_CTX_MODEM_RESPONSE_VOICE cpu_to_le32(0xC00A000F)
-#define STATUS_CTX_TD_ERROR cpu_to_le32(0xC00A0010)
-#define STATUS_CTX_LICENSE_CLIENT_INVALID cpu_to_le32(0xC00A0012)
-#define STATUS_CTX_LICENSE_NOT_AVAILABLE cpu_to_le32(0xC00A0013)
-#define STATUS_CTX_LICENSE_EXPIRED cpu_to_le32(0xC00A0014)
-#define STATUS_CTX_WINSTATION_NOT_FOUND cpu_to_le32(0xC00A0015)
-#define STATUS_CTX_WINSTATION_NAME_COLLISION cpu_to_le32(0xC00A0016)
-#define STATUS_CTX_WINSTATION_BUSY cpu_to_le32(0xC00A0017)
-#define STATUS_CTX_BAD_VIDEO_MODE cpu_to_le32(0xC00A0018)
-#define STATUS_CTX_GRAPHICS_INVALID cpu_to_le32(0xC00A0022)
-#define STATUS_CTX_NOT_CONSOLE cpu_to_le32(0xC00A0024)
-#define STATUS_CTX_CLIENT_QUERY_TIMEOUT cpu_to_le32(0xC00A0026)
-#define STATUS_CTX_CONSOLE_DISCONNECT cpu_to_le32(0xC00A0027)
-#define STATUS_CTX_CONSOLE_CONNECT cpu_to_le32(0xC00A0028)
-#define STATUS_CTX_SHADOW_DENIED cpu_to_le32(0xC00A002A)
-#define STATUS_CTX_WINSTATION_ACCESS_DENIED cpu_to_le32(0xC00A002B)
-#define STATUS_CTX_INVALID_WD cpu_to_le32(0xC00A002E)
-#define STATUS_CTX_WD_NOT_FOUND cpu_to_le32(0xC00A002F)
-#define STATUS_CTX_SHADOW_INVALID cpu_to_le32(0xC00A0030)
-#define STATUS_CTX_SHADOW_DISABLED cpu_to_le32(0xC00A0031)
-#define STATUS_RDP_PROTOCOL_ERROR cpu_to_le32(0xC00A0032)
-#define STATUS_CTX_CLIENT_LICENSE_NOT_SET cpu_to_le32(0xC00A0033)
-#define STATUS_CTX_CLIENT_LICENSE_IN_USE cpu_to_le32(0xC00A0034)
-#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE cpu_to_le32(0xC00A0035)
-#define STATUS_CTX_SHADOW_NOT_RUNNING cpu_to_le32(0xC00A0036)
-#define STATUS_CTX_LOGON_DISABLED cpu_to_le32(0xC00A0037)
-#define STATUS_CTX_SECURITY_LAYER_ERROR cpu_to_le32(0xC00A0038)
-#define STATUS_TS_INCOMPATIBLE_SESSIONS cpu_to_le32(0xC00A0039)
-#define STATUS_MUI_FILE_NOT_FOUND cpu_to_le32(0xC00B0001)
-#define STATUS_MUI_INVALID_FILE cpu_to_le32(0xC00B0002)
-#define STATUS_MUI_INVALID_RC_CONFIG cpu_to_le32(0xC00B0003)
-#define STATUS_MUI_INVALID_LOCALE_NAME cpu_to_le32(0xC00B0004)
-#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME cpu_to_le32(0xC00B0005)
-#define STATUS_MUI_FILE_NOT_LOADED cpu_to_le32(0xC00B0006)
-#define STATUS_RESOURCE_ENUM_USER_STOP cpu_to_le32(0xC00B0007)
-#define STATUS_CLUSTER_INVALID_NODE cpu_to_le32(0xC0130001)
-#define STATUS_CLUSTER_NODE_EXISTS cpu_to_le32(0xC0130002)
-#define STATUS_CLUSTER_JOIN_IN_PROGRESS cpu_to_le32(0xC0130003)
-#define STATUS_CLUSTER_NODE_NOT_FOUND cpu_to_le32(0xC0130004)
-#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND cpu_to_le32(0xC0130005)
-#define STATUS_CLUSTER_NETWORK_EXISTS cpu_to_le32(0xC0130006)
-#define STATUS_CLUSTER_NETWORK_NOT_FOUND cpu_to_le32(0xC0130007)
-#define STATUS_CLUSTER_NETINTERFACE_EXISTS cpu_to_le32(0xC0130008)
-#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND cpu_to_le32(0xC0130009)
-#define STATUS_CLUSTER_INVALID_REQUEST cpu_to_le32(0xC013000A)
-#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER cpu_to_le32(0xC013000B)
-#define STATUS_CLUSTER_NODE_DOWN cpu_to_le32(0xC013000C)
-#define STATUS_CLUSTER_NODE_UNREACHABLE cpu_to_le32(0xC013000D)
-#define STATUS_CLUSTER_NODE_NOT_MEMBER cpu_to_le32(0xC013000E)
-#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS cpu_to_le32(0xC013000F)
-#define STATUS_CLUSTER_INVALID_NETWORK cpu_to_le32(0xC0130010)
-#define STATUS_CLUSTER_NO_NET_ADAPTERS cpu_to_le32(0xC0130011)
-#define STATUS_CLUSTER_NODE_UP cpu_to_le32(0xC0130012)
-#define STATUS_CLUSTER_NODE_PAUSED cpu_to_le32(0xC0130013)
-#define STATUS_CLUSTER_NODE_NOT_PAUSED cpu_to_le32(0xC0130014)
-#define STATUS_CLUSTER_NO_SECURITY_CONTEXT cpu_to_le32(0xC0130015)
-#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL cpu_to_le32(0xC0130016)
-#define STATUS_CLUSTER_POISONED cpu_to_le32(0xC0130017)
-#define STATUS_ACPI_INVALID_OPCODE cpu_to_le32(0xC0140001)
-#define STATUS_ACPI_STACK_OVERFLOW cpu_to_le32(0xC0140002)
-#define STATUS_ACPI_ASSERT_FAILED cpu_to_le32(0xC0140003)
-#define STATUS_ACPI_INVALID_INDEX cpu_to_le32(0xC0140004)
-#define STATUS_ACPI_INVALID_ARGUMENT cpu_to_le32(0xC0140005)
-#define STATUS_ACPI_FATAL cpu_to_le32(0xC0140006)
-#define STATUS_ACPI_INVALID_SUPERNAME cpu_to_le32(0xC0140007)
-#define STATUS_ACPI_INVALID_ARGTYPE cpu_to_le32(0xC0140008)
-#define STATUS_ACPI_INVALID_OBJTYPE cpu_to_le32(0xC0140009)
-#define STATUS_ACPI_INVALID_TARGETTYPE cpu_to_le32(0xC014000A)
-#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT cpu_to_le32(0xC014000B)
-#define STATUS_ACPI_ADDRESS_NOT_MAPPED cpu_to_le32(0xC014000C)
-#define STATUS_ACPI_INVALID_EVENTTYPE cpu_to_le32(0xC014000D)
-#define STATUS_ACPI_HANDLER_COLLISION cpu_to_le32(0xC014000E)
-#define STATUS_ACPI_INVALID_DATA cpu_to_le32(0xC014000F)
-#define STATUS_ACPI_INVALID_REGION cpu_to_le32(0xC0140010)
-#define STATUS_ACPI_INVALID_ACCESS_SIZE cpu_to_le32(0xC0140011)
-#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK cpu_to_le32(0xC0140012)
-#define STATUS_ACPI_ALREADY_INITIALIZED cpu_to_le32(0xC0140013)
-#define STATUS_ACPI_NOT_INITIALIZED cpu_to_le32(0xC0140014)
-#define STATUS_ACPI_INVALID_MUTEX_LEVEL cpu_to_le32(0xC0140015)
-#define STATUS_ACPI_MUTEX_NOT_OWNED cpu_to_le32(0xC0140016)
-#define STATUS_ACPI_MUTEX_NOT_OWNER cpu_to_le32(0xC0140017)
-#define STATUS_ACPI_RS_ACCESS cpu_to_le32(0xC0140018)
-#define STATUS_ACPI_INVALID_TABLE cpu_to_le32(0xC0140019)
-#define STATUS_ACPI_REG_HANDLER_FAILED cpu_to_le32(0xC0140020)
-#define STATUS_ACPI_POWER_REQUEST_FAILED cpu_to_le32(0xC0140021)
-#define STATUS_SXS_SECTION_NOT_FOUND cpu_to_le32(0xC0150001)
-#define STATUS_SXS_CANT_GEN_ACTCTX cpu_to_le32(0xC0150002)
-#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT cpu_to_le32(0xC0150003)
-#define STATUS_SXS_ASSEMBLY_NOT_FOUND cpu_to_le32(0xC0150004)
-#define STATUS_SXS_MANIFEST_FORMAT_ERROR cpu_to_le32(0xC0150005)
-#define STATUS_SXS_MANIFEST_PARSE_ERROR cpu_to_le32(0xC0150006)
-#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED cpu_to_le32(0xC0150007)
-#define STATUS_SXS_KEY_NOT_FOUND cpu_to_le32(0xC0150008)
-#define STATUS_SXS_VERSION_CONFLICT cpu_to_le32(0xC0150009)
-#define STATUS_SXS_WRONG_SECTION_TYPE cpu_to_le32(0xC015000A)
-#define STATUS_SXS_THREAD_QUERIES_DISABLED cpu_to_le32(0xC015000B)
-#define STATUS_SXS_ASSEMBLY_MISSING cpu_to_le32(0xC015000C)
-#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET cpu_to_le32(0xC015000E)
-#define STATUS_SXS_EARLY_DEACTIVATION cpu_to_le32(0xC015000F)
-#define STATUS_SXS_INVALID_DEACTIVATION cpu_to_le32(0xC0150010)
-#define STATUS_SXS_MULTIPLE_DEACTIVATION cpu_to_le32(0xC0150011)
-#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY \
- cpu_to_le32(0xC0150012)
-#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED cpu_to_le32(0xC0150013)
-#define STATUS_SXS_CORRUPT_ACTIVATION_STACK cpu_to_le32(0xC0150014)
-#define STATUS_SXS_CORRUPTION cpu_to_le32(0xC0150015)
-#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE cpu_to_le32(0xC0150016)
-#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME cpu_to_le32(0xC0150017)
-#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE cpu_to_le32(0xC0150018)
-#define STATUS_SXS_IDENTITY_PARSE_ERROR cpu_to_le32(0xC0150019)
-#define STATUS_SXS_COMPONENT_STORE_CORRUPT cpu_to_le32(0xC015001A)
-#define STATUS_SXS_FILE_HASH_MISMATCH cpu_to_le32(0xC015001B)
-#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT \
- cpu_to_le32(0xC015001C)
-#define STATUS_SXS_IDENTITIES_DIFFERENT cpu_to_le32(0xC015001D)
-#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT cpu_to_le32(0xC015001E)
-#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY cpu_to_le32(0xC015001F)
-#define STATUS_ADVANCED_INSTALLER_FAILED cpu_to_le32(0xC0150020)
-#define STATUS_XML_ENCODING_MISMATCH cpu_to_le32(0xC0150021)
-#define STATUS_SXS_MANIFEST_TOO_BIG cpu_to_le32(0xC0150022)
-#define STATUS_SXS_SETTING_NOT_REGISTERED cpu_to_le32(0xC0150023)
-#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE cpu_to_le32(0xC0150024)
-#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED cpu_to_le32(0xC0150025)
-#define STATUS_GENERIC_COMMAND_FAILED cpu_to_le32(0xC0150026)
-#define STATUS_SXS_FILE_HASH_MISSING cpu_to_le32(0xC0150027)
-#define STATUS_TRANSACTIONAL_CONFLICT cpu_to_le32(0xC0190001)
-#define STATUS_INVALID_TRANSACTION cpu_to_le32(0xC0190002)
-#define STATUS_TRANSACTION_NOT_ACTIVE cpu_to_le32(0xC0190003)
-#define STATUS_TM_INITIALIZATION_FAILED cpu_to_le32(0xC0190004)
-#define STATUS_RM_NOT_ACTIVE cpu_to_le32(0xC0190005)
-#define STATUS_RM_METADATA_CORRUPT cpu_to_le32(0xC0190006)
-#define STATUS_TRANSACTION_NOT_JOINED cpu_to_le32(0xC0190007)
-#define STATUS_DIRECTORY_NOT_RM cpu_to_le32(0xC0190008)
-#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE cpu_to_le32(0xC019000A)
-#define STATUS_LOG_RESIZE_INVALID_SIZE cpu_to_le32(0xC019000B)
-#define STATUS_REMOTE_FILE_VERSION_MISMATCH cpu_to_le32(0xC019000C)
-#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS cpu_to_le32(0xC019000F)
-#define STATUS_TRANSACTION_PROPAGATION_FAILED cpu_to_le32(0xC0190010)
-#define STATUS_CRM_PROTOCOL_NOT_FOUND cpu_to_le32(0xC0190011)
-#define STATUS_TRANSACTION_SUPERIOR_EXISTS cpu_to_le32(0xC0190012)
-#define STATUS_TRANSACTION_REQUEST_NOT_VALID cpu_to_le32(0xC0190013)
-#define STATUS_TRANSACTION_NOT_REQUESTED cpu_to_le32(0xC0190014)
-#define STATUS_TRANSACTION_ALREADY_ABORTED cpu_to_le32(0xC0190015)
-#define STATUS_TRANSACTION_ALREADY_COMMITTED cpu_to_le32(0xC0190016)
-#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER cpu_to_le32(0xC0190017)
-#define STATUS_CURRENT_TRANSACTION_NOT_VALID cpu_to_le32(0xC0190018)
-#define STATUS_LOG_GROWTH_FAILED cpu_to_le32(0xC0190019)
-#define STATUS_OBJECT_NO_LONGER_EXISTS cpu_to_le32(0xC0190021)
-#define STATUS_STREAM_MINIVERSION_NOT_FOUND cpu_to_le32(0xC0190022)
-#define STATUS_STREAM_MINIVERSION_NOT_VALID cpu_to_le32(0xC0190023)
-#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION \
- cpu_to_le32(0xC0190024)
-#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT cpu_to_le32(0xC0190025)
-#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS cpu_to_le32(0xC0190026)
-#define STATUS_HANDLE_NO_LONGER_VALID cpu_to_le32(0xC0190028)
-#define STATUS_LOG_CORRUPTION_DETECTED cpu_to_le32(0xC0190030)
-#define STATUS_RM_DISCONNECTED cpu_to_le32(0xC0190032)
-#define STATUS_ENLISTMENT_NOT_SUPERIOR cpu_to_le32(0xC0190033)
-#define STATUS_FILE_IDENTITY_NOT_PERSISTENT cpu_to_le32(0xC0190036)
-#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY cpu_to_le32(0xC0190037)
-#define STATUS_CANT_CROSS_RM_BOUNDARY cpu_to_le32(0xC0190038)
-#define STATUS_TXF_DIR_NOT_EMPTY cpu_to_le32(0xC0190039)
-#define STATUS_INDOUBT_TRANSACTIONS_EXIST cpu_to_le32(0xC019003A)
-#define STATUS_TM_VOLATILE cpu_to_le32(0xC019003B)
-#define STATUS_ROLLBACK_TIMER_EXPIRED cpu_to_le32(0xC019003C)
-#define STATUS_TXF_ATTRIBUTE_CORRUPT cpu_to_le32(0xC019003D)
-#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC019003E)
-#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED cpu_to_le32(0xC019003F)
-#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE cpu_to_le32(0xC0190040)
-#define STATUS_TRANSACTION_REQUIRED_PROMOTION cpu_to_le32(0xC0190043)
-#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION cpu_to_le32(0xC0190044)
-#define STATUS_TRANSACTIONS_NOT_FROZEN cpu_to_le32(0xC0190045)
-#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS cpu_to_le32(0xC0190046)
-#define STATUS_NOT_SNAPSHOT_VOLUME cpu_to_le32(0xC0190047)
-#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES cpu_to_le32(0xC0190048)
-#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190049)
-#define STATUS_TM_IDENTITY_MISMATCH cpu_to_le32(0xC019004A)
-#define STATUS_FLOATED_SECTION cpu_to_le32(0xC019004B)
-#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK cpu_to_le32(0xC019004C)
-#define STATUS_CANNOT_ABORT_TRANSACTIONS cpu_to_le32(0xC019004D)
-#define STATUS_TRANSACTION_NOT_FOUND cpu_to_le32(0xC019004E)
-#define STATUS_RESOURCEMANAGER_NOT_FOUND cpu_to_le32(0xC019004F)
-#define STATUS_ENLISTMENT_NOT_FOUND cpu_to_le32(0xC0190050)
-#define STATUS_TRANSACTIONMANAGER_NOT_FOUND cpu_to_le32(0xC0190051)
-#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE cpu_to_le32(0xC0190052)
-#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION \
- cpu_to_le32(0xC0190053)
-#define STATUS_TRANSACTION_NOT_ROOT cpu_to_le32(0xC0190054)
-#define STATUS_TRANSACTION_OBJECT_EXPIRED cpu_to_le32(0xC0190055)
-#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190056)
-#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED cpu_to_le32(0xC0190057)
-#define STATUS_TRANSACTION_RECORD_TOO_LONG cpu_to_le32(0xC0190058)
-#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION cpu_to_le32(0xC0190059)
-#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION cpu_to_le32(0xC019005A)
-#define STATUS_TRANSACTION_INTEGRITY_VIOLATED cpu_to_le32(0xC019005B)
-#define STATUS_LOG_SECTOR_INVALID cpu_to_le32(0xC01A0001)
-#define STATUS_LOG_SECTOR_PARITY_INVALID cpu_to_le32(0xC01A0002)
-#define STATUS_LOG_SECTOR_REMAPPED cpu_to_le32(0xC01A0003)
-#define STATUS_LOG_BLOCK_INCOMPLETE cpu_to_le32(0xC01A0004)
-#define STATUS_LOG_INVALID_RANGE cpu_to_le32(0xC01A0005)
-#define STATUS_LOG_BLOCKS_EXHAUSTED cpu_to_le32(0xC01A0006)
-#define STATUS_LOG_READ_CONTEXT_INVALID cpu_to_le32(0xC01A0007)
-#define STATUS_LOG_RESTART_INVALID cpu_to_le32(0xC01A0008)
-#define STATUS_LOG_BLOCK_VERSION cpu_to_le32(0xC01A0009)
-#define STATUS_LOG_BLOCK_INVALID cpu_to_le32(0xC01A000A)
-#define STATUS_LOG_READ_MODE_INVALID cpu_to_le32(0xC01A000B)
-#define STATUS_LOG_METADATA_CORRUPT cpu_to_le32(0xC01A000D)
-#define STATUS_LOG_METADATA_INVALID cpu_to_le32(0xC01A000E)
-#define STATUS_LOG_METADATA_INCONSISTENT cpu_to_le32(0xC01A000F)
-#define STATUS_LOG_RESERVATION_INVALID cpu_to_le32(0xC01A0010)
-#define STATUS_LOG_CANT_DELETE cpu_to_le32(0xC01A0011)
-#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED cpu_to_le32(0xC01A0012)
-#define STATUS_LOG_START_OF_LOG cpu_to_le32(0xC01A0013)
-#define STATUS_LOG_POLICY_ALREADY_INSTALLED cpu_to_le32(0xC01A0014)
-#define STATUS_LOG_POLICY_NOT_INSTALLED cpu_to_le32(0xC01A0015)
-#define STATUS_LOG_POLICY_INVALID cpu_to_le32(0xC01A0016)
-#define STATUS_LOG_POLICY_CONFLICT cpu_to_le32(0xC01A0017)
-#define STATUS_LOG_PINNED_ARCHIVE_TAIL cpu_to_le32(0xC01A0018)
-#define STATUS_LOG_RECORD_NONEXISTENT cpu_to_le32(0xC01A0019)
-#define STATUS_LOG_RECORDS_RESERVED_INVALID cpu_to_le32(0xC01A001A)
-#define STATUS_LOG_SPACE_RESERVED_INVALID cpu_to_le32(0xC01A001B)
-#define STATUS_LOG_TAIL_INVALID cpu_to_le32(0xC01A001C)
-#define STATUS_LOG_FULL cpu_to_le32(0xC01A001D)
-#define STATUS_LOG_MULTIPLEXED cpu_to_le32(0xC01A001E)
-#define STATUS_LOG_DEDICATED cpu_to_le32(0xC01A001F)
-#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS cpu_to_le32(0xC01A0020)
-#define STATUS_LOG_ARCHIVE_IN_PROGRESS cpu_to_le32(0xC01A0021)
-#define STATUS_LOG_EPHEMERAL cpu_to_le32(0xC01A0022)
-#define STATUS_LOG_NOT_ENOUGH_CONTAINERS cpu_to_le32(0xC01A0023)
-#define STATUS_LOG_CLIENT_ALREADY_REGISTERED cpu_to_le32(0xC01A0024)
-#define STATUS_LOG_CLIENT_NOT_REGISTERED cpu_to_le32(0xC01A0025)
-#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS cpu_to_le32(0xC01A0026)
-#define STATUS_LOG_CONTAINER_READ_FAILED cpu_to_le32(0xC01A0027)
-#define STATUS_LOG_CONTAINER_WRITE_FAILED cpu_to_le32(0xC01A0028)
-#define STATUS_LOG_CONTAINER_OPEN_FAILED cpu_to_le32(0xC01A0029)
-#define STATUS_LOG_CONTAINER_STATE_INVALID cpu_to_le32(0xC01A002A)
-#define STATUS_LOG_STATE_INVALID cpu_to_le32(0xC01A002B)
-#define STATUS_LOG_PINNED cpu_to_le32(0xC01A002C)
-#define STATUS_LOG_METADATA_FLUSH_FAILED cpu_to_le32(0xC01A002D)
-#define STATUS_LOG_INCONSISTENT_SECURITY cpu_to_le32(0xC01A002E)
-#define STATUS_LOG_APPENDED_FLUSH_FAILED cpu_to_le32(0xC01A002F)
-#define STATUS_LOG_PINNED_RESERVATION cpu_to_le32(0xC01A0030)
-#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC01B00EA)
-#define STATUS_FLT_NO_HANDLER_DEFINED cpu_to_le32(0xC01C0001)
-#define STATUS_FLT_CONTEXT_ALREADY_DEFINED cpu_to_le32(0xC01C0002)
-#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST cpu_to_le32(0xC01C0003)
-#define STATUS_FLT_DISALLOW_FAST_IO cpu_to_le32(0xC01C0004)
-#define STATUS_FLT_INVALID_NAME_REQUEST cpu_to_le32(0xC01C0005)
-#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION cpu_to_le32(0xC01C0006)
-#define STATUS_FLT_NOT_INITIALIZED cpu_to_le32(0xC01C0007)
-#define STATUS_FLT_FILTER_NOT_READY cpu_to_le32(0xC01C0008)
-#define STATUS_FLT_POST_OPERATION_CLEANUP cpu_to_le32(0xC01C0009)
-#define STATUS_FLT_INTERNAL_ERROR cpu_to_le32(0xC01C000A)
-#define STATUS_FLT_DELETING_OBJECT cpu_to_le32(0xC01C000B)
-#define STATUS_FLT_MUST_BE_NONPAGED_POOL cpu_to_le32(0xC01C000C)
-#define STATUS_FLT_DUPLICATE_ENTRY cpu_to_le32(0xC01C000D)
-#define STATUS_FLT_CBDQ_DISABLED cpu_to_le32(0xC01C000E)
-#define STATUS_FLT_DO_NOT_ATTACH cpu_to_le32(0xC01C000F)
-#define STATUS_FLT_DO_NOT_DETACH cpu_to_le32(0xC01C0010)
-#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION cpu_to_le32(0xC01C0011)
-#define STATUS_FLT_INSTANCE_NAME_COLLISION cpu_to_le32(0xC01C0012)
-#define STATUS_FLT_FILTER_NOT_FOUND cpu_to_le32(0xC01C0013)
-#define STATUS_FLT_VOLUME_NOT_FOUND cpu_to_le32(0xC01C0014)
-#define STATUS_FLT_INSTANCE_NOT_FOUND cpu_to_le32(0xC01C0015)
-#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND cpu_to_le32(0xC01C0016)
-#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION cpu_to_le32(0xC01C0017)
-#define STATUS_FLT_NAME_CACHE_MISS cpu_to_le32(0xC01C0018)
-#define STATUS_FLT_NO_DEVICE_OBJECT cpu_to_le32(0xC01C0019)
-#define STATUS_FLT_VOLUME_ALREADY_MOUNTED cpu_to_le32(0xC01C001A)
-#define STATUS_FLT_ALREADY_ENLISTED cpu_to_le32(0xC01C001B)
-#define STATUS_FLT_CONTEXT_ALREADY_LINKED cpu_to_le32(0xC01C001C)
-#define STATUS_FLT_NO_WAITER_FOR_REPLY cpu_to_le32(0xC01C0020)
-#define STATUS_MONITOR_NO_DESCRIPTOR cpu_to_le32(0xC01D0001)
-#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT cpu_to_le32(0xC01D0002)
-#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM cpu_to_le32(0xC01D0003)
-#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK cpu_to_le32(0xC01D0004)
-#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED cpu_to_le32(0xC01D0005)
-#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK \
- cpu_to_le32(0xC01D0006)
-#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK \
- cpu_to_le32(0xC01D0007)
-#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA cpu_to_le32(0xC01D0008)
-#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK cpu_to_le32(0xC01D0009)
-#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER cpu_to_le32(0xC01E0000)
-#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER cpu_to_le32(0xC01E0001)
-#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER cpu_to_le32(0xC01E0002)
-#define STATUS_GRAPHICS_ADAPTER_WAS_RESET cpu_to_le32(0xC01E0003)
-#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL cpu_to_le32(0xC01E0004)
-#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED cpu_to_le32(0xC01E0005)
-#define STATUS_GRAPHICS_PRESENT_OCCLUDED cpu_to_le32(0xC01E0006)
-#define STATUS_GRAPHICS_PRESENT_DENIED cpu_to_le32(0xC01E0007)
-#define STATUS_GRAPHICS_CANNOTCOLORCONVERT cpu_to_le32(0xC01E0008)
-#define STATUS_GRAPHICS_NO_VIDEO_MEMORY cpu_to_le32(0xC01E0100)
-#define STATUS_GRAPHICS_CANT_LOCK_MEMORY cpu_to_le32(0xC01E0101)
-#define STATUS_GRAPHICS_ALLOCATION_BUSY cpu_to_le32(0xC01E0102)
-#define STATUS_GRAPHICS_TOO_MANY_REFERENCES cpu_to_le32(0xC01E0103)
-#define STATUS_GRAPHICS_TRY_AGAIN_LATER cpu_to_le32(0xC01E0104)
-#define STATUS_GRAPHICS_TRY_AGAIN_NOW cpu_to_le32(0xC01E0105)
-#define STATUS_GRAPHICS_ALLOCATION_INVALID cpu_to_le32(0xC01E0106)
-#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE cpu_to_le32(0xC01E0107)
-#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED cpu_to_le32(0xC01E0108)
-#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION cpu_to_le32(0xC01E0109)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE cpu_to_le32(0xC01E0110)
-#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION cpu_to_le32(0xC01E0111)
-#define STATUS_GRAPHICS_ALLOCATION_CLOSED cpu_to_le32(0xC01E0112)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE cpu_to_le32(0xC01E0113)
-#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE cpu_to_le32(0xC01E0114)
-#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE cpu_to_le32(0xC01E0115)
-#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST cpu_to_le32(0xC01E0116)
-#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE cpu_to_le32(0xC01E0200)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0300)
-#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED cpu_to_le32(0xC01E0301)
-#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED \
- cpu_to_le32(0xC01E0302)
-#define STATUS_GRAPHICS_INVALID_VIDPN cpu_to_le32(0xC01E0303)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE cpu_to_le32(0xC01E0304)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET cpu_to_le32(0xC01E0305)
-#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED cpu_to_le32(0xC01E0306)
-#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET cpu_to_le32(0xC01E0308)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET cpu_to_le32(0xC01E0309)
-#define STATUS_GRAPHICS_INVALID_FREQUENCY cpu_to_le32(0xC01E030A)
-#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION cpu_to_le32(0xC01E030B)
-#define STATUS_GRAPHICS_INVALID_TOTAL_REGION cpu_to_le32(0xC01E030C)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE \
- cpu_to_le32(0xC01E0310)
-#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE \
- cpu_to_le32(0xC01E0311)
-#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET cpu_to_le32(0xC01E0312)
-#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY cpu_to_le32(0xC01E0313)
-#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET cpu_to_le32(0xC01E0314)
-#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET cpu_to_le32(0xC01E0315)
-#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET cpu_to_le32(0xC01E0316)
-#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET cpu_to_le32(0xC01E0317)
-#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET cpu_to_le32(0xC01E0318)
-#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH cpu_to_le32(0xC01E0319)
-#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY cpu_to_le32(0xC01E031A)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET \
- cpu_to_le32(0xC01E031B)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE cpu_to_le32(0xC01E031C)
-#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET cpu_to_le32(0xC01E031D)
-#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET cpu_to_le32(0xC01E031F)
-#define STATUS_GRAPHICS_STALE_MODESET cpu_to_le32(0xC01E0320)
-#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET cpu_to_le32(0xC01E0321)
-#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE cpu_to_le32(0xC01E0322)
-#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN cpu_to_le32(0xC01E0323)
-#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0324)
-#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION \
- cpu_to_le32(0xC01E0325)
-#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES \
- cpu_to_le32(0xC01E0326)
-#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0327)
-#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE \
- cpu_to_le32(0xC01E0328)
-#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET \
- cpu_to_le32(0xC01E0329)
-#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET cpu_to_le32(0xC01E032A)
-#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR cpu_to_le32(0xC01E032B)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET cpu_to_le32(0xC01E032C)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET cpu_to_le32(0xC01E032D)
-#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE \
- cpu_to_le32(0xC01E032E)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE cpu_to_le32(0xC01E032F)
-#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED cpu_to_le32(0xC01E0330)
-#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0331)
-#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0332)
-#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET cpu_to_le32(0xC01E0333)
-#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER \
- cpu_to_le32(0xC01E0334)
-#define STATUS_GRAPHICS_NO_VIDPNMGR cpu_to_le32(0xC01E0335)
-#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN cpu_to_le32(0xC01E0336)
-#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0337)
-#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED cpu_to_le32(0xC01E0338)
-#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0339)
-#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE cpu_to_le32(0xC01E033A)
-#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE cpu_to_le32(0xC01E033B)
-#define STATUS_GRAPHICS_INVALID_STRIDE cpu_to_le32(0xC01E033C)
-#define STATUS_GRAPHICS_INVALID_PIXELFORMAT cpu_to_le32(0xC01E033D)
-#define STATUS_GRAPHICS_INVALID_COLORBASIS cpu_to_le32(0xC01E033E)
-#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE cpu_to_le32(0xC01E033F)
-#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0340)
-#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT \
- cpu_to_le32(0xC01E0341)
-#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE cpu_to_le32(0xC01E0342)
-#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN cpu_to_le32(0xC01E0343)
-#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL cpu_to_le32(0xC01E0344)
-#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION \
- cpu_to_le32(0xC01E0345)
-#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED \
- cpu_to_le32(0xC01E0346)
-#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP cpu_to_le32(0xC01E0347)
-#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED cpu_to_le32(0xC01E0348)
-#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED cpu_to_le32(0xC01E0349)
-#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET cpu_to_le32(0xC01E034A)
-#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON \
- cpu_to_le32(0xC01E034D)
-#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE cpu_to_le32(0xC01E034E)
-#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE cpu_to_le32(0xC01E034F)
-#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS \
- cpu_to_le32(0xC01E0350)
-#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING cpu_to_le32(0xC01E0352)
-#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED cpu_to_le32(0xC01E0353)
-#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS cpu_to_le32(0xC01E0354)
-#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT cpu_to_le32(0xC01E0355)
-#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM cpu_to_le32(0xC01E0356)
-#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN \
- cpu_to_le32(0xC01E0357)
-#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT \
- cpu_to_le32(0xC01E0358)
-#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED cpu_to_le32(0xC01E0359)
-#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION \
- cpu_to_le32(0xC01E035A)
-#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE cpu_to_le32(0xC01E035B)
-#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET cpu_to_le32(0xC01E035C)
-#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED \
- cpu_to_le32(0xC01E0400)
-#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED cpu_to_le32(0xC01E0401)
-#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER cpu_to_le32(0xC01E0430)
-#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED cpu_to_le32(0xC01E0431)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED cpu_to_le32(0xC01E0432)
-#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY cpu_to_le32(0xC01E0433)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED cpu_to_le32(0xC01E0434)
-#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON cpu_to_le32(0xC01E0435)
-#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE cpu_to_le32(0xC01E0436)
-#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER cpu_to_le32(0xC01E0438)
-#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED cpu_to_le32(0xC01E043B)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS \
- cpu_to_le32(0xC01E051C)
-#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST cpu_to_le32(0xC01E051D)
-#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC01E051E)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS \
- cpu_to_le32(0xC01E051F)
-#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED cpu_to_le32(0xC01E0520)
-#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST \
- cpu_to_le32(0xC01E0521)
-#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED cpu_to_le32(0xC01E0500)
-#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED cpu_to_le32(0xC01E0501)
-#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED cpu_to_le32(0xC01E0502)
-#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS cpu_to_le32(0xC01E0503)
-#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E0504)
-#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST cpu_to_le32(0xC01E0505)
-#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
- cpu_to_le32(0xC01E0506)
-#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
- cpu_to_le32(0xC01E0507)
-#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED \
- cpu_to_le32(0xC01E0508)
-#define STATUS_GRAPHICS_OPM_INVALID_POINTER cpu_to_le32(0xC01E050A)
-#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR cpu_to_le32(0xC01E050B)
-#define STATUS_GRAPHICS_OPM_INVALID_HANDLE cpu_to_le32(0xC01E050C)
-#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
- cpu_to_le32(0xC01E050D)
-#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH cpu_to_le32(0xC01E050E)
-#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED cpu_to_le32(0xC01E050F)
-#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED cpu_to_le32(0xC01E0510)
-#define STATUS_GRAPHICS_PVP_HFS_FAILED cpu_to_le32(0xC01E0511)
-#define STATUS_GRAPHICS_OPM_INVALID_SRM cpu_to_le32(0xC01E0512)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP cpu_to_le32(0xC01E0513)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP cpu_to_le32(0xC01E0514)
-#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA \
- cpu_to_le32(0xC01E0515)
-#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET cpu_to_le32(0xC01E0516)
-#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH cpu_to_le32(0xC01E0517)
-#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE \
- cpu_to_le32(0xC01E0518)
-#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS \
- cpu_to_le32(0xC01E051A)
-#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS \
- cpu_to_le32(0xC01E051B)
-#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED cpu_to_le32(0xC01E0580)
-#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC01E0581)
-#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA cpu_to_le32(0xC01E0582)
-#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA cpu_to_le32(0xC01E0583)
-#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED cpu_to_le32(0xC01E0584)
-#define STATUS_GRAPHICS_DDCCI_INVALID_DATA cpu_to_le32(0xC01E0585)
-#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE \
- cpu_to_le32(0xC01E0586)
-#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING \
- cpu_to_le32(0xC01E0587)
-#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR cpu_to_le32(0xC01E0588)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND cpu_to_le32(0xC01E0589)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH cpu_to_le32(0xC01E058A)
-#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM cpu_to_le32(0xC01E058B)
-#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE cpu_to_le32(0xC01E058C)
-#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS cpu_to_le32(0xC01E058D)
-#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED cpu_to_le32(0xC01E05E0)
-#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
- cpu_to_le32(0xC01E05E1)
-#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
- cpu_to_le32(0xC01E05E2)
-#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED cpu_to_le32(0xC01E05E3)
-#define STATUS_GRAPHICS_INVALID_POINTER cpu_to_le32(0xC01E05E4)
-#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
- cpu_to_le32(0xC01E05E5)
-#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E05E6)
-#define STATUS_GRAPHICS_INTERNAL_ERROR cpu_to_le32(0xC01E05E7)
-#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS cpu_to_le32(0xC01E05E8)
-#define STATUS_FVE_LOCKED_VOLUME cpu_to_le32(0xC0210000)
-#define STATUS_FVE_NOT_ENCRYPTED cpu_to_le32(0xC0210001)
-#define STATUS_FVE_BAD_INFORMATION cpu_to_le32(0xC0210002)
-#define STATUS_FVE_TOO_SMALL cpu_to_le32(0xC0210003)
-#define STATUS_FVE_FAILED_WRONG_FS cpu_to_le32(0xC0210004)
-#define STATUS_FVE_FAILED_BAD_FS cpu_to_le32(0xC0210005)
-#define STATUS_FVE_FS_NOT_EXTENDED cpu_to_le32(0xC0210006)
-#define STATUS_FVE_FS_MOUNTED cpu_to_le32(0xC0210007)
-#define STATUS_FVE_NO_LICENSE cpu_to_le32(0xC0210008)
-#define STATUS_FVE_ACTION_NOT_ALLOWED cpu_to_le32(0xC0210009)
-#define STATUS_FVE_BAD_DATA cpu_to_le32(0xC021000A)
-#define STATUS_FVE_VOLUME_NOT_BOUND cpu_to_le32(0xC021000B)
-#define STATUS_FVE_NOT_DATA_VOLUME cpu_to_le32(0xC021000C)
-#define STATUS_FVE_CONV_READ_ERROR cpu_to_le32(0xC021000D)
-#define STATUS_FVE_CONV_WRITE_ERROR cpu_to_le32(0xC021000E)
-#define STATUS_FVE_OVERLAPPED_UPDATE cpu_to_le32(0xC021000F)
-#define STATUS_FVE_FAILED_SECTOR_SIZE cpu_to_le32(0xC0210010)
-#define STATUS_FVE_FAILED_AUTHENTICATION cpu_to_le32(0xC0210011)
-#define STATUS_FVE_NOT_OS_VOLUME cpu_to_le32(0xC0210012)
-#define STATUS_FVE_KEYFILE_NOT_FOUND cpu_to_le32(0xC0210013)
-#define STATUS_FVE_KEYFILE_INVALID cpu_to_le32(0xC0210014)
-#define STATUS_FVE_KEYFILE_NO_VMK cpu_to_le32(0xC0210015)
-#define STATUS_FVE_TPM_DISABLED cpu_to_le32(0xC0210016)
-#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO cpu_to_le32(0xC0210017)
-#define STATUS_FVE_TPM_INVALID_PCR cpu_to_le32(0xC0210018)
-#define STATUS_FVE_TPM_NO_VMK cpu_to_le32(0xC0210019)
-#define STATUS_FVE_PIN_INVALID cpu_to_le32(0xC021001A)
-#define STATUS_FVE_AUTH_INVALID_APPLICATION cpu_to_le32(0xC021001B)
-#define STATUS_FVE_AUTH_INVALID_CONFIG cpu_to_le32(0xC021001C)
-#define STATUS_FVE_DEBUGGER_ENABLED cpu_to_le32(0xC021001D)
-#define STATUS_FVE_DRY_RUN_FAILED cpu_to_le32(0xC021001E)
-#define STATUS_FVE_BAD_METADATA_POINTER cpu_to_le32(0xC021001F)
-#define STATUS_FVE_OLD_METADATA_COPY cpu_to_le32(0xC0210020)
-#define STATUS_FVE_REBOOT_REQUIRED cpu_to_le32(0xC0210021)
-#define STATUS_FVE_RAW_ACCESS cpu_to_le32(0xC0210022)
-#define STATUS_FVE_RAW_BLOCKED cpu_to_le32(0xC0210023)
-#define STATUS_FWP_CALLOUT_NOT_FOUND cpu_to_le32(0xC0220001)
-#define STATUS_FWP_CONDITION_NOT_FOUND cpu_to_le32(0xC0220002)
-#define STATUS_FWP_FILTER_NOT_FOUND cpu_to_le32(0xC0220003)
-#define STATUS_FWP_LAYER_NOT_FOUND cpu_to_le32(0xC0220004)
-#define STATUS_FWP_PROVIDER_NOT_FOUND cpu_to_le32(0xC0220005)
-#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND cpu_to_le32(0xC0220006)
-#define STATUS_FWP_SUBLAYER_NOT_FOUND cpu_to_le32(0xC0220007)
-#define STATUS_FWP_NOT_FOUND cpu_to_le32(0xC0220008)
-#define STATUS_FWP_ALREADY_EXISTS cpu_to_le32(0xC0220009)
-#define STATUS_FWP_IN_USE cpu_to_le32(0xC022000A)
-#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS cpu_to_le32(0xC022000B)
-#define STATUS_FWP_WRONG_SESSION cpu_to_le32(0xC022000C)
-#define STATUS_FWP_NO_TXN_IN_PROGRESS cpu_to_le32(0xC022000D)
-#define STATUS_FWP_TXN_IN_PROGRESS cpu_to_le32(0xC022000E)
-#define STATUS_FWP_TXN_ABORTED cpu_to_le32(0xC022000F)
-#define STATUS_FWP_SESSION_ABORTED cpu_to_le32(0xC0220010)
-#define STATUS_FWP_INCOMPATIBLE_TXN cpu_to_le32(0xC0220011)
-#define STATUS_FWP_TIMEOUT cpu_to_le32(0xC0220012)
-#define STATUS_FWP_NET_EVENTS_DISABLED cpu_to_le32(0xC0220013)
-#define STATUS_FWP_INCOMPATIBLE_LAYER cpu_to_le32(0xC0220014)
-#define STATUS_FWP_KM_CLIENTS_ONLY cpu_to_le32(0xC0220015)
-#define STATUS_FWP_LIFETIME_MISMATCH cpu_to_le32(0xC0220016)
-#define STATUS_FWP_BUILTIN_OBJECT cpu_to_le32(0xC0220017)
-#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS cpu_to_le32(0xC0220018)
-#define STATUS_FWP_TOO_MANY_CALLOUTS cpu_to_le32(0xC0220018)
-#define STATUS_FWP_NOTIFICATION_DROPPED cpu_to_le32(0xC0220019)
-#define STATUS_FWP_TRAFFIC_MISMATCH cpu_to_le32(0xC022001A)
-#define STATUS_FWP_INCOMPATIBLE_SA_STATE cpu_to_le32(0xC022001B)
-#define STATUS_FWP_NULL_POINTER cpu_to_le32(0xC022001C)
-#define STATUS_FWP_INVALID_ENUMERATOR cpu_to_le32(0xC022001D)
-#define STATUS_FWP_INVALID_FLAGS cpu_to_le32(0xC022001E)
-#define STATUS_FWP_INVALID_NET_MASK cpu_to_le32(0xC022001F)
-#define STATUS_FWP_INVALID_RANGE cpu_to_le32(0xC0220020)
-#define STATUS_FWP_INVALID_INTERVAL cpu_to_le32(0xC0220021)
-#define STATUS_FWP_ZERO_LENGTH_ARRAY cpu_to_le32(0xC0220022)
-#define STATUS_FWP_NULL_DISPLAY_NAME cpu_to_le32(0xC0220023)
-#define STATUS_FWP_INVALID_ACTION_TYPE cpu_to_le32(0xC0220024)
-#define STATUS_FWP_INVALID_WEIGHT cpu_to_le32(0xC0220025)
-#define STATUS_FWP_MATCH_TYPE_MISMATCH cpu_to_le32(0xC0220026)
-#define STATUS_FWP_TYPE_MISMATCH cpu_to_le32(0xC0220027)
-#define STATUS_FWP_OUT_OF_BOUNDS cpu_to_le32(0xC0220028)
-#define STATUS_FWP_RESERVED cpu_to_le32(0xC0220029)
-#define STATUS_FWP_DUPLICATE_CONDITION cpu_to_le32(0xC022002A)
-#define STATUS_FWP_DUPLICATE_KEYMOD cpu_to_le32(0xC022002B)
-#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002C)
-#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER cpu_to_le32(0xC022002D)
-#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002E)
-#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT cpu_to_le32(0xC022002F)
-#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD cpu_to_le32(0xC0220030)
-#define STATUS_FWP_INCOMPATIBLE_DH_GROUP cpu_to_le32(0xC0220031)
-#define STATUS_FWP_EM_NOT_SUPPORTED cpu_to_le32(0xC0220032)
-#define STATUS_FWP_NEVER_MATCH cpu_to_le32(0xC0220033)
-#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH cpu_to_le32(0xC0220034)
-#define STATUS_FWP_INVALID_PARAMETER cpu_to_le32(0xC0220035)
-#define STATUS_FWP_TOO_MANY_SUBLAYERS cpu_to_le32(0xC0220036)
-#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED cpu_to_le32(0xC0220037)
-#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG cpu_to_le32(0xC0220038)
-#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG cpu_to_le32(0xC0220039)
-#define STATUS_FWP_TCPIP_NOT_READY cpu_to_le32(0xC0220100)
-#define STATUS_FWP_INJECT_HANDLE_CLOSING cpu_to_le32(0xC0220101)
-#define STATUS_FWP_INJECT_HANDLE_STALE cpu_to_le32(0xC0220102)
-#define STATUS_FWP_CANNOT_PEND cpu_to_le32(0xC0220103)
-#define STATUS_NDIS_CLOSING cpu_to_le32(0xC0230002)
-#define STATUS_NDIS_BAD_VERSION cpu_to_le32(0xC0230004)
-#define STATUS_NDIS_BAD_CHARACTERISTICS cpu_to_le32(0xC0230005)
-#define STATUS_NDIS_ADAPTER_NOT_FOUND cpu_to_le32(0xC0230006)
-#define STATUS_NDIS_OPEN_FAILED cpu_to_le32(0xC0230007)
-#define STATUS_NDIS_DEVICE_FAILED cpu_to_le32(0xC0230008)
-#define STATUS_NDIS_MULTICAST_FULL cpu_to_le32(0xC0230009)
-#define STATUS_NDIS_MULTICAST_EXISTS cpu_to_le32(0xC023000A)
-#define STATUS_NDIS_MULTICAST_NOT_FOUND cpu_to_le32(0xC023000B)
-#define STATUS_NDIS_REQUEST_ABORTED cpu_to_le32(0xC023000C)
-#define STATUS_NDIS_RESET_IN_PROGRESS cpu_to_le32(0xC023000D)
-#define STATUS_NDIS_INVALID_PACKET cpu_to_le32(0xC023000F)
-#define STATUS_NDIS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0230010)
-#define STATUS_NDIS_ADAPTER_NOT_READY cpu_to_le32(0xC0230011)
-#define STATUS_NDIS_INVALID_LENGTH cpu_to_le32(0xC0230014)
-#define STATUS_NDIS_INVALID_DATA cpu_to_le32(0xC0230015)
-#define STATUS_NDIS_BUFFER_TOO_SHORT cpu_to_le32(0xC0230016)
-#define STATUS_NDIS_INVALID_OID cpu_to_le32(0xC0230017)
-#define STATUS_NDIS_ADAPTER_REMOVED cpu_to_le32(0xC0230018)
-#define STATUS_NDIS_UNSUPPORTED_MEDIA cpu_to_le32(0xC0230019)
-#define STATUS_NDIS_GROUP_ADDRESS_IN_USE cpu_to_le32(0xC023001A)
-#define STATUS_NDIS_FILE_NOT_FOUND cpu_to_le32(0xC023001B)
-#define STATUS_NDIS_ERROR_READING_FILE cpu_to_le32(0xC023001C)
-#define STATUS_NDIS_ALREADY_MAPPED cpu_to_le32(0xC023001D)
-#define STATUS_NDIS_RESOURCE_CONFLICT cpu_to_le32(0xC023001E)
-#define STATUS_NDIS_MEDIA_DISCONNECTED cpu_to_le32(0xC023001F)
-#define STATUS_NDIS_INVALID_ADDRESS cpu_to_le32(0xC0230022)
-#define STATUS_NDIS_PAUSED cpu_to_le32(0xC023002A)
-#define STATUS_NDIS_INTERFACE_NOT_FOUND cpu_to_le32(0xC023002B)
-#define STATUS_NDIS_UNSUPPORTED_REVISION cpu_to_le32(0xC023002C)
-#define STATUS_NDIS_INVALID_PORT cpu_to_le32(0xC023002D)
-#define STATUS_NDIS_INVALID_PORT_STATE cpu_to_le32(0xC023002E)
-#define STATUS_NDIS_LOW_POWER_STATE cpu_to_le32(0xC023002F)
-#define STATUS_NDIS_NOT_SUPPORTED cpu_to_le32(0xC02300BB)
-#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED cpu_to_le32(0xC0232000)
-#define STATUS_NDIS_DOT11_MEDIA_IN_USE cpu_to_le32(0xC0232001)
-#define STATUS_NDIS_DOT11_POWER_STATE_INVALID cpu_to_le32(0xC0232002)
-#define STATUS_IPSEC_BAD_SPI cpu_to_le32(0xC0360001)
-#define STATUS_IPSEC_SA_LIFETIME_EXPIRED cpu_to_le32(0xC0360002)
-#define STATUS_IPSEC_WRONG_SA cpu_to_le32(0xC0360003)
-#define STATUS_IPSEC_REPLAY_CHECK_FAILED cpu_to_le32(0xC0360004)
-#define STATUS_IPSEC_INVALID_PACKET cpu_to_le32(0xC0360005)
-#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED cpu_to_le32(0xC0360006)
-#define STATUS_IPSEC_CLEAR_TEXT_DROP cpu_to_le32(0xC0360007)
-
-#define STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP cpu_to_le32(0xC05D0000)
-#define STATUS_INVALID_LOCK_RANGE cpu_to_le32(0xC00001a1)
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index cf4418f72772..44c87e300c16 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -21,7 +21,7 @@
#include "glob.h"
#include "connection.h"
#include "smb_common.h"
-#include "smbstatus.h"
+#include "../common/smb2status.h"
#include "transport_rdma.h"
#define SMB_DIRECT_PORT_IWARP 5445
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index a84788396daa..aaed9e293b2e 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -624,8 +624,10 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz)
for_each_netdev(&init_net, netdev) {
if (netif_is_bridge_port(netdev))
continue;
- if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL)))
+ if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) {
+ rtnl_unlock();
return -ENOMEM;
+ }
}
rtnl_unlock();
bind_additional_ifaces = 1;
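
The transport_tcp.c hunk above adds the rtnl_unlock() that was missing on the -ENOMEM path, so the RTNL lock is dropped on every exit from the loop. A minimal userspace sketch of the same release-on-every-path pattern, assuming hypothetical iface_add()/set_interfaces() helpers and a pthread mutex standing in for the RTNL lock (not part of the patch):

/* Every return taken while the lock is held must drop it first,
 * mirroring the rtnl_unlock() added before the -ENOMEM return above. */
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t ifc_lock = PTHREAD_MUTEX_INITIALIZER;

static int iface_add(const char *name)
{
	char *copy = strdup(name);	/* stands in for kstrdup()/alloc_iface() */

	if (!copy)
		return -ENOMEM;
	/* ... register the interface ... */
	free(copy);
	return 0;
}

static int set_interfaces(const char **names, int n)
{
	int i, err;

	pthread_mutex_lock(&ifc_lock);
	for (i = 0; i < n; i++) {
		err = iface_add(names[i]);
		if (err) {
			/* the fix: unlock before bailing out */
			pthread_mutex_unlock(&ifc_lock);
			return err;
		}
	}
	pthread_mutex_unlock(&ifc_lock);
	return 0;
}

int main(void)
{
	const char *names[] = { "eth0", "eth1" };

	return set_interfaces(names, 2) ? 1 : 0;
}
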
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 9e859ba010cf..7cbd580120d1 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -496,7 +496,7 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
int err = 0;
if (work->conn->connection_type) {
- if (!(fp->daccess & FILE_WRITE_DATA_LE)) {
+ if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
pr_err("no right to write(%pD)\n", fp->filp);
err = -EACCES;
goto out;
@@ -1115,9 +1115,10 @@ static bool __dir_empty(struct dir_context *ctx, const char *name, int namlen,
struct ksmbd_readdir_data *buf;
buf = container_of(ctx, struct ksmbd_readdir_data, ctx);
- buf->dirent_count++;
+ if (!is_dot_dotdot(name, namlen))
+ buf->dirent_count++;
- return buf->dirent_count <= 2;
+ return !buf->dirent_count;
}
/**
@@ -1137,7 +1138,7 @@ int ksmbd_vfs_empty_dir(struct ksmbd_file *fp)
readdir_data.dirent_count = 0;
err = iterate_dir(fp->filp, &readdir_data.ctx);
- if (readdir_data.dirent_count > 2)
+ if (readdir_data.dirent_count)
err = -ENOTEMPTY;
else
err = 0;
@@ -1166,7 +1167,7 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name,
if (cmp < 0)
cmp = strncasecmp((char *)buf->private, name, namlen);
if (!cmp) {
- memcpy((char *)buf->private, name, namlen);
+ memcpy((char *)buf->private, name, buf->used);
buf->dirent_count = 1;
return false;
}
@@ -1234,10 +1235,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
char *filepath;
size_t path_len, remain_len;
- filepath = kstrdup(name, GFP_KERNEL);
- if (!filepath)
- return -ENOMEM;
-
+ filepath = name;
path_len = strlen(filepath);
remain_len = path_len;
@@ -1280,10 +1278,9 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
err = -EINVAL;
out2:
path_put(parent_path);
-out1:
- kfree(filepath);
}
+out1:
if (!err) {
err = mnt_want_write(parent_path->mnt);
if (err) {
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 4d4ee696e37c..a19f4e563c7e 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -863,6 +863,8 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon,
list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
if (op->conn != conn)
continue;
+ if (op->conn && atomic_dec_and_test(&op->conn->refcnt))
+ kfree(op->conn);
op->conn = NULL;
}
up_write(&ci->m_lock);
@@ -965,6 +967,7 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
if (op->conn)
continue;
op->conn = fp->conn;
+ atomic_inc(&op->conn->refcnt);
}
up_write(&ci->m_lock);
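
The two vfs_cache.c hunks above pair the reference taken when a disconnected durable handle is re-attached to a connection (atomic_inc(&op->conn->refcnt)) with a matching drop-and-free when that connection's file descriptors are torn down. A loose userspace analogue of that get/put discipline, using C11 atomics; conn_get()/conn_put() and the struct names are invented for illustration:

/* Pair every stored pointer with a get, and every clear with a put. */
#include <stdatomic.h>
#include <stdlib.h>

struct conn {
	atomic_int refcnt;
};

struct oplock {
	struct conn *conn;
};

static struct conn *conn_get(struct conn *c)
{
	atomic_fetch_add(&c->refcnt, 1);
	return c;
}

static void conn_put(struct conn *c)
{
	/* free when the last reference is dropped */
	if (atomic_fetch_sub(&c->refcnt, 1) == 1)
		free(c);
}

static void oplock_attach(struct oplock *op, struct conn *c)
{
	op->conn = conn_get(c);		/* mirrors atomic_inc(&op->conn->refcnt) */
}

static void oplock_detach(struct oplock *op)
{
	if (op->conn) {
		conn_put(op->conn);	/* mirrors atomic_dec_and_test() + kfree() */
		op->conn = NULL;
	}
}

int main(void)
{
	struct conn *c = calloc(1, sizeof(*c));
	struct oplock op = { 0 };

	if (!c)
		return 1;
	atomic_init(&c->refcnt, 1);	/* creator holds the initial reference */
	oplock_attach(&op, c);
	oplock_detach(&op);
	conn_put(c);			/* drop the creator's reference; frees c */
	return 0;
}
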
diff --git a/fs/smb/server/xattr.h b/fs/smb/server/xattr.h
index 16499ca5c82d..fa3e27d6971b 100644
--- a/fs/smb/server/xattr.h
+++ b/fs/smb/server/xattr.h
@@ -76,7 +76,7 @@ struct xattr_acl_entry {
struct xattr_smb_acl {
int count;
int next;
- struct xattr_acl_entry entries[];
+ struct xattr_acl_entry entries[] __counted_by(count);
};
/* 64bytes hash in xattr_ntacl is computed with sha256 */
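
The xattr.h hunk annotates the flexible entries[] array with __counted_by(count), which tells FORTIFY/UBSAN-style bounds checking which field holds the element count at runtime. A standalone sketch of the same shape, assuming a compiler that understands the counted_by attribute (the macro fallback covers ones that do not); the struct and helper names are hypothetical:

/* Flexible array whose length lives in a sibling field, kept in sync
 * at allocation time so bounds checks can use it. */
#include <stdio.h>
#include <stdlib.h>

#if defined(__has_attribute)
# if __has_attribute(counted_by)
#  define COUNTED_BY(m) __attribute__((counted_by(m)))
# endif
#endif
#ifndef COUNTED_BY
# define COUNTED_BY(m)
#endif

struct acl_entry {
	int type;
	unsigned int id;
};

struct acl {
	int count;				/* number of valid entries[] */
	struct acl_entry entries[] COUNTED_BY(count);
};

static struct acl *acl_alloc(int count)
{
	struct acl *a = malloc(sizeof(*a) + sizeof(a->entries[0]) * count);

	if (a)
		a->count = count;		/* keep counter and allocation in sync */
	return a;
}

int main(void)
{
	struct acl *a = acl_alloc(4);

	if (!a)
		return 1;
	printf("allocated %d entries\n", a->count);
	free(a);
	return 0;
}
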
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index a8c1e7f9a609..21aaa96856c1 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -494,39 +494,73 @@ out:
}
static int squashfs_readahead_fragment(struct page **page,
- unsigned int pages, unsigned int expected)
+ unsigned int pages, unsigned int expected, loff_t start)
{
struct inode *inode = page[0]->mapping->host;
struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
squashfs_i(inode)->fragment_block,
squashfs_i(inode)->fragment_size);
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
- int error = buffer->error;
+ int i, bytes, copied;
+ struct squashfs_page_actor *actor;
+ unsigned int offset;
+ void *addr;
+ struct page *last_page;
+
+ if (buffer->error)
+ goto out;
- if (error)
+ actor = squashfs_page_actor_init_special(msblk, page, pages,
+ expected, start);
+ if (!actor)
goto out;
- expected += squashfs_i(inode)->fragment_offset;
+ squashfs_actor_nobuff(actor);
+ addr = squashfs_first_page(actor);
+
+ for (copied = offset = 0; offset < expected; offset += PAGE_SIZE) {
+ int avail = min_t(int, expected - offset, PAGE_SIZE);
+
+ if (!IS_ERR(addr)) {
+ bytes = squashfs_copy_data(addr, buffer, offset +
+ squashfs_i(inode)->fragment_offset, avail);
+
+ if (bytes != avail)
+ goto failed;
+ }
+
+ copied += avail;
+ addr = squashfs_next_page(actor);
+ }
- for (n = 0; n < pages; n++) {
- unsigned int base = (page[n]->index & mask) << PAGE_SHIFT;
- unsigned int offset = base + squashfs_i(inode)->fragment_offset;
+ last_page = squashfs_page_actor_free(actor);
- if (expected > offset) {
- unsigned int avail = min_t(unsigned int, expected -
- offset, PAGE_SIZE);
+ if (copied == expected && !IS_ERR(last_page)) {
+ /* Last page (if present) may have trailing bytes not filled */
+ bytes = copied % PAGE_SIZE;
+ if (bytes && last_page)
+ memzero_page(last_page, bytes, PAGE_SIZE - bytes);
- squashfs_fill_page(page[n], buffer, offset, avail);
+ for (i = 0; i < pages; i++) {
+ flush_dcache_page(page[i]);
+ SetPageUptodate(page[i]);
}
+ }
- unlock_page(page[n]);
- put_page(page[n]);
+ for (i = 0; i < pages; i++) {
+ unlock_page(page[i]);
+ put_page(page[i]);
}
+ squashfs_cache_put(buffer);
+ return 0;
+
+failed:
+ squashfs_page_actor_free(actor);
+
out:
squashfs_cache_put(buffer);
- return error;
+ return 1;
}
static void squashfs_readahead(struct readahead_control *ractl)
@@ -551,7 +585,6 @@ static void squashfs_readahead(struct readahead_control *ractl)
return;
for (;;) {
- pgoff_t index;
int res, bsize;
u64 block = 0;
unsigned int expected;
@@ -570,26 +603,21 @@ static void squashfs_readahead(struct readahead_control *ractl)
if (readahead_pos(ractl) >= i_size_read(inode))
goto skip_pages;
- index = pages[0]->index >> shift;
-
- if ((pages[nr_pages - 1]->index >> shift) != index)
- goto skip_pages;
-
- if (index == file_end && squashfs_i(inode)->fragment_block !=
- SQUASHFS_INVALID_BLK) {
+ if (start >> msblk->block_log == file_end &&
+ squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) {
res = squashfs_readahead_fragment(pages, nr_pages,
- expected);
+ expected, start);
if (res)
goto skip_pages;
continue;
}
- bsize = read_blocklist(inode, index, &block);
+ bsize = read_blocklist(inode, start >> msblk->block_log, &block);
if (bsize == 0)
goto skip_pages;
actor = squashfs_page_actor_init_special(msblk, pages, nr_pages,
- expected);
+ expected, start);
if (!actor)
goto skip_pages;
@@ -597,12 +625,12 @@ static void squashfs_readahead(struct readahead_control *ractl)
last_page = squashfs_page_actor_free(actor);
- if (res == expected) {
+ if (res == expected && !IS_ERR(last_page)) {
int bytes;
/* Last page (if present) may have trailing bytes not filled */
bytes = res % PAGE_SIZE;
- if (index == file_end && bytes && last_page)
+ if (start >> msblk->block_log == file_end && bytes && last_page)
memzero_page(last_page, bytes,
PAGE_SIZE - bytes);
@@ -616,6 +644,8 @@ static void squashfs_readahead(struct readahead_control *ractl)
unlock_page(pages[i]);
put_page(pages[i]);
}
+
+ start += readahead_batch_length(ractl);
}
kfree(pages);
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 2a689ce71de9..22251743fadf 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -23,15 +23,15 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int expected)
{
+ struct folio *folio = page_folio(target_page);
struct inode *inode = target_page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-
loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
- loff_t start_index = target_page->index & ~mask;
+ loff_t start_index = folio->index & ~mask;
loff_t end_index = start_index | mask;
int i, n, pages, bytes, res = -ENOMEM;
- struct page **page;
+ struct page **page, *last_page;
struct squashfs_page_actor *actor;
void *pageaddr;
@@ -46,7 +46,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
/* Try to grab all the pages covered by the Squashfs block */
for (i = 0, n = start_index; n <= end_index; n++) {
- page[i] = (n == target_page->index) ? target_page :
+ page[i] = (n == folio->index) ? target_page :
grab_cache_page_nowait(target_page->mapping, n);
if (page[i] == NULL)
@@ -67,27 +67,28 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
* Create a "page actor" which will kmap and kunmap the
* page cache pages appropriately within the decompressor
*/
- actor = squashfs_page_actor_init_special(msblk, page, pages, expected);
+ actor = squashfs_page_actor_init_special(msblk, page, pages, expected,
+ start_index << PAGE_SHIFT);
if (actor == NULL)
goto out;
/* Decompress directly into the page cache buffers */
res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
- squashfs_page_actor_free(actor);
+ last_page = squashfs_page_actor_free(actor);
if (res < 0)
goto mark_errored;
- if (res != expected) {
+ if (res != expected || IS_ERR(last_page)) {
res = -EIO;
goto mark_errored;
}
/* Last page (if present) may have trailing bytes not filled */
bytes = res % PAGE_SIZE;
- if (page[pages - 1]->index == end_index && bytes) {
- pageaddr = kmap_local_page(page[pages - 1]);
+ if (end_index == file_end && last_page && bytes) {
+ pageaddr = kmap_local_page(last_page);
memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
kunmap_local(pageaddr);
}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 16bd693d0b3a..d5918eba27e3 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -279,8 +279,13 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
- set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
+ if (inode->i_size > PAGE_SIZE) {
+ ERROR("Corrupted symlink\n");
+ return -EINVAL;
+ }
+
+ set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_op = &squashfs_symlink_inode_ops;
inode_nohighmem(inode);
inode->i_data.a_ops = &squashfs_symlink_aops;
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 81af6c4ca115..2b3e807d4dea 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -60,6 +60,11 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
}
/* Implementation of page_actor for decompressing directly into page cache. */
+static loff_t page_next_index(struct squashfs_page_actor *actor)
+{
+ return page_folio(actor->page[actor->next_page])->index;
+}
+
static void *handle_next_page(struct squashfs_page_actor *actor)
{
int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -68,7 +73,7 @@ static void *handle_next_page(struct squashfs_page_actor *actor)
return NULL;
if ((actor->next_page == actor->pages) ||
- (actor->next_index != actor->page[actor->next_page]->index)) {
+ (actor->next_index != page_next_index(actor))) {
actor->next_index++;
actor->returned_pages++;
actor->last_page = NULL;
@@ -103,7 +108,7 @@ static void direct_finish_page(struct squashfs_page_actor *actor)
}
struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk,
- struct page **page, int pages, int length)
+ struct page **page, int pages, int length, loff_t start_index)
{
struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
@@ -125,7 +130,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_
actor->pages = pages;
actor->next_page = 0;
actor->returned_pages = 0;
- actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1);
+ actor->next_index = start_index >> PAGE_SHIFT;
actor->pageaddr = NULL;
actor->last_page = NULL;
actor->alloc_buffer = msblk->decompressor->alloc_buffer;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 97d4983559b1..ffe25eb77c32 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -29,13 +29,15 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
int pages, int length);
extern struct squashfs_page_actor *squashfs_page_actor_init_special(
struct squashfs_sb_info *msblk,
- struct page **page, int pages, int length);
+ struct page **page, int pages, int length,
+ loff_t start_index);
static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor)
{
- struct page *last_page = actor->last_page;
+ struct page *last_page = actor->next_page == actor->pages ? actor->last_page : ERR_PTR(-EIO);
kfree(actor->tmp_buffer);
kfree(actor);
+
return last_page;
}
static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
diff --git a/fs/super.c b/fs/super.c
index 38d72a3cf6fc..1db230432960 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -621,7 +621,7 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
- cgroup_writeback_umount();
+ cgroup_writeback_umount(sb);
/* Evict all inodes with zero refcount. */
evict_inodes(sb);
@@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc)
return error;
if (!fc->root) {
- pr_err("Filesystem %s get_tree() didn't set fc->root\n",
- fc->fs_type->name);
+ pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n",
+ fc->fs_type->name, error);
/* We don't know what the locking state of the superblock is -
* if there is a superblock.
*/
@@ -1905,7 +1905,7 @@ static void lockdep_sb_freeze_release(struct super_block *sb)
int level;
for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
- percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
+ percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_);
}
/*
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 2e126d72d619..639307e2ff8c 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -28,17 +28,17 @@ const struct file_operations sysv_dir_operations = {
.fsync = generic_file_fsync,
};
-static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *dir = mapping->host;
- block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ block_write_end(NULL, mapping, pos, len, len, folio, NULL);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
- unlock_page(page);
+ folio_unlock(folio);
}
static int sysv_handle_dirsync(struct inode *dir)
@@ -52,20 +52,21 @@ static int sysv_handle_dirsync(struct inode *dir)
}
/*
- * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the
+ * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
* rules documented in mm/highmem.rst.
*
- * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page()
+ * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio()
* and must be treated accordingly for nesting purposes.
*/
-static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p)
+static void *dir_get_folio(struct inode *dir, unsigned long n,
+ struct folio **foliop)
{
- struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
- if (IS_ERR(page))
- return ERR_CAST(page);
- *p = page;
- return kmap_local_page(page);
+ struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL);
+
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
+ *foliop = folio;
+ return kmap_local_folio(folio, 0);
}
static int sysv_readdir(struct file *file, struct dir_context *ctx)
@@ -87,9 +88,9 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
struct sysv_dir_entry *de;
- struct page *page;
+ struct folio *folio;
- kaddr = dir_get_page(inode, n, &page);
+ kaddr = dir_get_folio(inode, n, &folio);
if (IS_ERR(kaddr))
continue;
de = (struct sysv_dir_entry *)(kaddr+offset);
@@ -103,11 +104,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
fs16_to_cpu(SYSV_SB(sb), de->inode),
DT_UNKNOWN)) {
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
return 0;
}
}
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
@@ -126,39 +127,35 @@ static inline int namecompare(int len, int maxlen,
/*
* sysv_find_entry()
*
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * finds an entry in the specified directory with the wanted name.
+ * It does NOT read the inode of the
* entry - you'll have to do that yourself if you want to.
*
- * On Success unmap_and_put_page() should be called on *res_page.
+ * On Success folio_release_kmap() should be called on *foliop.
*
- * sysv_find_entry() acts as a call to dir_get_page() and must be treated
+ * sysv_find_entry() acts as a call to dir_get_folio() and must be treated
* accordingly for nesting purposes.
*/
-struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page)
+struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop)
{
const char * name = dentry->d_name.name;
int namelen = dentry->d_name.len;
struct inode * dir = d_inode(dentry->d_parent);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
struct sysv_dir_entry *de;
- *res_page = NULL;
-
start = SYSV_I(dir)->i_dir_start_lookup;
if (start >= npages)
start = 0;
n = start;
do {
- char *kaddr = dir_get_page(dir, n, &page);
+ char *kaddr = dir_get_folio(dir, n, foliop);
if (!IS_ERR(kaddr)) {
de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - SYSV_DIRSIZE;
+ kaddr += folio_size(*foliop) - SYSV_DIRSIZE;
for ( ; (char *) de <= kaddr ; de++) {
if (!de->inode)
continue;
@@ -166,7 +163,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
name, de->name))
goto found;
}
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(*foliop, kaddr);
}
if (++n >= npages)
@@ -177,7 +174,6 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
found:
SYSV_I(dir)->i_dir_start_lookup = n;
- *res_page = page;
return de;
}
@@ -186,7 +182,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
struct inode *dir = d_inode(dentry->d_parent);
const char * name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct sysv_dir_entry * de;
unsigned long npages = dir_pages(dir);
unsigned long n;
@@ -196,7 +192,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
/* We take care of directory expansion in the same loop */
for (n = 0; n <= npages; n++) {
- kaddr = dir_get_page(dir, n, &page);
+ kaddr = dir_get_folio(dir, n, &folio);
if (IS_ERR(kaddr))
return PTR_ERR(kaddr);
de = (struct sysv_dir_entry *)kaddr;
@@ -206,49 +202,49 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
goto got_it;
err = -EEXIST;
if (namecompare(namelen, SYSV_NAMELEN, name, de->name))
- goto out_page;
+ goto out_folio;
de++;
}
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
BUG();
return -EINVAL;
got_it:
- pos = page_offset(page) + offset_in_page(de);
- lock_page(page);
- err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+ pos = folio_pos(folio) + offset_in_folio(folio, de);
+ folio_lock(folio);
+ err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
if (err)
goto out_unlock;
memcpy (de->name, name, namelen);
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+ dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = sysv_handle_dirsync(dir);
-out_page:
- unmap_and_put_page(page, kaddr);
+out_folio:
+ folio_release_kmap(folio, kaddr);
return err;
out_unlock:
- unlock_page(page);
- goto out_page;
+ folio_unlock(folio);
+ goto out_folio;
}
-int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
+int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- loff_t pos = page_offset(page) + offset_in_page(de);
+ struct inode *inode = folio->mapping->host;
+ loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
int err;
- lock_page(page);
- err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+ folio_lock(folio);
+ err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
de->inode = 0;
- dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+ dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
return sysv_handle_dirsync(inode);
@@ -256,33 +252,33 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
int sysv_make_empty(struct inode *inode, struct inode *dir)
{
- struct page *page = grab_cache_page(inode->i_mapping, 0);
+ struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
struct sysv_dir_entry * de;
- char *base;
+ char *kaddr;
int err;
- if (!page)
- return -ENOMEM;
- err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
goto fail;
}
- base = kmap_local_page(page);
- memset(base, 0, PAGE_SIZE);
+ kaddr = kmap_local_folio(folio, 0);
+ memset(kaddr, 0, folio_size(folio));
- de = (struct sysv_dir_entry *) base;
+ de = (struct sysv_dir_entry *)kaddr;
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
strcpy(de->name,".");
de++;
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino);
strcpy(de->name,"..");
- kunmap_local(base);
- dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE);
+ kunmap_local(kaddr);
+ dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE);
err = sysv_handle_dirsync(inode);
fail:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -292,19 +288,19 @@ fail:
int sysv_empty_dir(struct inode * inode)
{
struct super_block *sb = inode->i_sb;
- struct page *page = NULL;
+ struct folio *folio = NULL;
unsigned long i, npages = dir_pages(inode);
char *kaddr;
for (i = 0; i < npages; i++) {
struct sysv_dir_entry *de;
- kaddr = dir_get_page(inode, i, &page);
+ kaddr = dir_get_folio(inode, i, &folio);
if (IS_ERR(kaddr))
continue;
de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_SIZE-SYSV_DIRSIZE;
+ kaddr += folio_size(folio) - SYSV_DIRSIZE;
for ( ;(char *)de <= kaddr; de++) {
if (!de->inode)
@@ -321,46 +317,46 @@ int sysv_empty_dir(struct inode * inode)
if (de->name[1] != '.' || de->name[2])
goto not_empty;
}
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
}
return 1;
not_empty:
- unmap_and_put_page(page, kaddr);
+ folio_release_kmap(folio, kaddr);
return 0;
}
/* Releases the page */
-int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
- struct inode *inode)
+int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio,
+ struct inode *inode)
{
- struct inode *dir = page->mapping->host;
- loff_t pos = page_offset(page) + offset_in_page(de);
+ struct inode *dir = folio->mapping->host;
+ loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
int err;
- lock_page(page);
- err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
+ folio_lock(folio);
+ err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(page, pos, SYSV_DIRSIZE);
+ dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
return sysv_handle_dirsync(inode);
}
/*
- * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the
+ * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
* rules documented in mm/highmem.rst.
*
- * sysv_dotdot() acts as a call to dir_get_page() and must be treated
+ * sysv_dotdot() acts as a call to dir_get_folio() and must be treated
* accordingly for nesting purposes.
*/
-struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p)
+struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop)
{
- struct sysv_dir_entry *de = dir_get_page(dir, 0, p);
+ struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop);
if (IS_ERR(de))
return NULL;
@@ -370,13 +366,13 @@ struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p)
ino_t sysv_inode_by_name(struct dentry *dentry)
{
- struct page *page;
- struct sysv_dir_entry *de = sysv_find_entry (dentry, &page);
+ struct folio *folio;
+ struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio);
ino_t res = 0;
if (de) {
res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
- unmap_and_put_page(page, de);
+ folio_release_kmap(folio, de);
}
return res;
}
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 19bcb51a2203..451e95f474fa 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -466,9 +466,9 @@ static int sysv_read_folio(struct file *file, struct folio *folio)
return block_read_full_folio(folio, get_block);
}
-int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- return __block_write_begin(page, pos, len, get_block);
+ return __block_write_begin(folio, pos, len, get_block);
}
static void sysv_write_failed(struct address_space *mapping, loff_t to)
@@ -483,11 +483,11 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
static int sysv_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, pagep, get_block);
+ ret = block_write_begin(mapping, pos, len, foliop, get_block);
if (unlikely(ret))
sysv_write_failed(mapping, pos + len);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index d6b73798071b..fb8bd8437872 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -151,20 +151,20 @@ out_dir:
static int sysv_unlink(struct inode * dir, struct dentry * dentry)
{
struct inode * inode = d_inode(dentry);
- struct page * page;
+ struct folio *folio;
struct sysv_dir_entry * de;
int err;
- de = sysv_find_entry(dentry, &page);
+ de = sysv_find_entry(dentry, &folio);
if (!de)
return -ENOENT;
- err = sysv_delete_entry(de, page);
+ err = sysv_delete_entry(de, folio);
if (!err) {
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
}
- unmap_and_put_page(page, de);
+ folio_release_kmap(folio, de);
return err;
}
@@ -194,28 +194,28 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
{
struct inode * old_inode = d_inode(old_dentry);
struct inode * new_inode = d_inode(new_dentry);
- struct page * dir_page = NULL;
+ struct folio *dir_folio;
struct sysv_dir_entry * dir_de = NULL;
- struct page * old_page;
+ struct folio *old_folio;
struct sysv_dir_entry * old_de;
int err = -ENOENT;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
- old_de = sysv_find_entry(old_dentry, &old_page);
+ old_de = sysv_find_entry(old_dentry, &old_folio);
if (!old_de)
goto out;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
- dir_de = sysv_dotdot(old_inode, &dir_page);
+ dir_de = sysv_dotdot(old_inode, &dir_folio);
if (!dir_de)
goto out_old;
}
if (new_inode) {
- struct page * new_page;
+ struct folio *new_folio;
struct sysv_dir_entry * new_de;
err = -ENOTEMPTY;
@@ -223,11 +223,11 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out_dir;
err = -ENOENT;
- new_de = sysv_find_entry(new_dentry, &new_page);
+ new_de = sysv_find_entry(new_dentry, &new_folio);
if (!new_de)
goto out_dir;
- err = sysv_set_link(new_de, new_page, old_inode);
- unmap_and_put_page(new_page, new_de);
+ err = sysv_set_link(new_de, new_folio, old_inode);
+ folio_release_kmap(new_folio, new_de);
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
@@ -242,23 +242,23 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inode_inc_link_count(new_dir);
}
- err = sysv_delete_entry(old_de, old_page);
+ err = sysv_delete_entry(old_de, old_folio);
if (err)
goto out_dir;
mark_inode_dirty(old_inode);
if (dir_de) {
- err = sysv_set_link(dir_de, dir_page, new_dir);
+ err = sysv_set_link(dir_de, dir_folio, new_dir);
if (!err)
inode_dec_link_count(old_dir);
}
out_dir:
if (dir_de)
- unmap_and_put_page(dir_page, dir_de);
+ folio_release_kmap(dir_folio, dir_de);
out_old:
- unmap_and_put_page(old_page, old_de);
+ folio_release_kmap(old_folio, old_de);
out:
return err;
}
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index e3f988b469ee..0a48b2e7edb1 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -133,8 +133,8 @@ extern void sysv_free_block(struct super_block *, sysv_zone_t);
extern unsigned long sysv_count_free_blocks(struct super_block *);
/* itree.c */
-extern void sysv_truncate(struct inode *);
-extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len);
+void sysv_truncate(struct inode *);
+int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len);
/* inode.c */
extern struct inode *sysv_iget(struct super_block *, unsigned int);
@@ -148,15 +148,15 @@ extern void sysv_destroy_icache(void);
/* dir.c */
-extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **);
-extern int sysv_add_link(struct dentry *, struct inode *);
-extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *);
-extern int sysv_make_empty(struct inode *, struct inode *);
-extern int sysv_empty_dir(struct inode *);
-extern int sysv_set_link(struct sysv_dir_entry *, struct page *,
+struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **);
+int sysv_add_link(struct dentry *, struct inode *);
+int sysv_delete_entry(struct sysv_dir_entry *, struct folio *);
+int sysv_make_empty(struct inode *, struct inode *);
+int sysv_empty_dir(struct inode *);
+int sysv_set_link(struct sysv_dir_entry *, struct folio *,
struct inode *);
-extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **);
-extern ino_t sysv_inode_by_name(struct dentry *);
+struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **);
+ino_t sysv_inode_by_name(struct dentry *);
extern const struct inode_operations sysv_file_inode_operations;
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 5d88c184f0fc..8705c77a9e75 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -112,7 +112,7 @@ static void release_ei(struct kref *ref)
entry->release(entry->name, ei->data);
}
- call_rcu(&ei->rcu, free_ei_rcu);
+ call_srcu(&eventfs_srcu, &ei->rcu, free_ei_rcu);
}
static inline void put_ei(struct eventfs_inode *ei)
@@ -736,7 +736,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
/* Was the parent freed? */
if (list_empty(&ei->list)) {
cleanup_ei(ei);
- ei = NULL;
+ ei = ERR_PTR(-EBUSY);
}
return ei;
}
@@ -862,7 +862,7 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
list_for_each_entry(ei_child, &ei->children, list)
eventfs_remove_rec(ei_child, level + 1);
- list_del(&ei->list);
+ list_del_rcu(&ei->list);
free_ei(ei);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 1028ab6d9a74..1748dff58c3b 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -42,7 +42,7 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
struct tracefs_inode *ti;
unsigned long flags;
- ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL);
+ ti = alloc_inode_sb(sb, tracefs_inode_cachep, GFP_KERNEL);
if (!ti)
return NULL;
@@ -53,15 +53,14 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
return &ti->vfs_inode;
}
-static void tracefs_free_inode_rcu(struct rcu_head *rcu)
+static void tracefs_free_inode(struct inode *inode)
{
- struct tracefs_inode *ti;
+ struct tracefs_inode *ti = get_tracefs(inode);
- ti = container_of(rcu, struct tracefs_inode, rcu);
kmem_cache_free(tracefs_inode_cachep, ti);
}
-static void tracefs_free_inode(struct inode *inode)
+static void tracefs_destroy_inode(struct inode *inode)
{
struct tracefs_inode *ti = get_tracefs(inode);
unsigned long flags;
@@ -69,8 +68,6 @@ static void tracefs_free_inode(struct inode *inode)
spin_lock_irqsave(&tracefs_inode_lock, flags);
list_del_rcu(&ti->list);
spin_unlock_irqrestore(&tracefs_inode_lock, flags);
-
- call_rcu(&ti->rcu, tracefs_free_inode_rcu);
}
static ssize_t default_read_file(struct file *file, char __user *buf,
@@ -437,6 +434,7 @@ static int tracefs_drop_inode(struct inode *inode)
static const struct super_operations tracefs_super_operations = {
.alloc_inode = tracefs_alloc_inode,
.free_inode = tracefs_free_inode,
+ .destroy_inode = tracefs_destroy_inode,
.drop_inode = tracefs_drop_inode,
.statfs = simple_statfs,
.show_options = tracefs_show_options,
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index f704d8348357..d83c2a25f288 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -10,10 +10,7 @@ enum {
};
struct tracefs_inode {
- union {
- struct inode vfs_inode;
- struct rcu_head rcu;
- };
+ struct inode vfs_inode;
/* The below gets initialized with memset_after(ti, 0, vfs_inode) */
struct list_head list;
unsigned long flags;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c77ea57fe696..fda82f3e16e8 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -555,6 +555,11 @@ static unsigned int vfs_dent_type(uint8_t type)
return 0;
}
+struct ubifs_dir_data {
+ struct ubifs_dent_node *dent;
+ u64 cookie;
+};
+
/*
* The classical Unix view for directory is that it is a linear array of
* (name, inode number) entries. Linux/VFS assumes this model as well.
@@ -582,6 +587,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
bool encrypted = IS_ENCRYPTED(dir);
+ struct ubifs_dir_data *data = file->private_data;
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
@@ -604,27 +610,27 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
fstr_real_len = fstr.len;
}
- if (file->f_version == 0) {
+ if (data->cookie == 0) {
/*
- * The file was seek'ed, which means that @file->private_data
+ * The file was seek'ed, which means that @data->dent
* is now invalid. This may also be just the first
* 'ubifs_readdir()' invocation, in which case
- * @file->private_data is NULL, and the below code is
+ * @data->dent is NULL, and the below code is
* basically a no-op.
*/
- kfree(file->private_data);
- file->private_data = NULL;
+ kfree(data->dent);
+ data->dent = NULL;
}
/*
- * 'generic_file_llseek()' unconditionally sets @file->f_version to
- * zero, and we use this for detecting whether the file was seek'ed.
+ * 'ubifs_dir_llseek()' sets @data->cookie to zero, and we use this
+ * for detecting whether the file was seek'ed.
*/
- file->f_version = 1;
+ data->cookie = 1;
/* File positions 0 and 1 correspond to "." and ".." */
if (ctx->pos < 2) {
- ubifs_assert(c, !file->private_data);
+ ubifs_assert(c, !data->dent);
if (!dir_emit_dots(file, ctx)) {
if (encrypted)
fscrypt_fname_free_buffer(&fstr);
@@ -641,10 +647,10 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
ctx->pos = key_hash_flash(c, &dent->key);
- file->private_data = dent;
+ data->dent = dent;
}
- dent = file->private_data;
+ dent = data->dent;
if (!dent) {
/*
* The directory was seek'ed to and is now readdir'ed.
@@ -658,7 +664,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
goto out;
}
ctx->pos = key_hash_flash(c, &dent->key);
- file->private_data = dent;
+ data->dent = dent;
}
while (1) {
@@ -701,15 +707,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
goto out;
}
- kfree(file->private_data);
+ kfree(data->dent);
ctx->pos = key_hash_flash(c, &dent->key);
- file->private_data = dent;
+ data->dent = dent;
cond_resched();
}
out:
- kfree(file->private_data);
- file->private_data = NULL;
+ kfree(data->dent);
+ data->dent = NULL;
if (encrypted)
fscrypt_fname_free_buffer(&fstr);
@@ -733,7 +739,10 @@ out:
/* Free saved readdir() state when the directory is closed */
static int ubifs_dir_release(struct inode *dir, struct file *file)
{
- kfree(file->private_data);
+ struct ubifs_dir_data *data = file->private_data;
+
+ kfree(data->dent);
+ kfree(data);
file->private_data = NULL;
return 0;
}
@@ -1712,6 +1721,24 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path,
return 0;
}
+static int ubifs_dir_open(struct inode *inode, struct file *file)
+{
+ struct ubifs_dir_data *data;
+
+ data = kzalloc(sizeof(struct ubifs_dir_data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ file->private_data = data;
+ return 0;
+}
+
+static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct ubifs_dir_data *data = file->private_data;
+
+ return generic_llseek_cookie(file, offset, whence, &data->cookie);
+}
+
const struct inode_operations ubifs_dir_inode_operations = {
.lookup = ubifs_lookup,
.create = ubifs_create,
@@ -1732,7 +1759,8 @@ const struct inode_operations ubifs_dir_inode_operations = {
};
const struct file_operations ubifs_dir_operations = {
- .llseek = generic_file_llseek,
+ .open = ubifs_dir_open,
+ .llseek = ubifs_dir_llseek,
.release = ubifs_dir_release,
.read = generic_read_dir,
.iterate_shared = ubifs_readdir,
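
The ubifs/dir.c changes above (and the udf/dir.c ones further below) stop piggybacking on file->f_version and instead keep a per-open cookie in file->private_data, which generic_llseek_cookie() zeroes on seek so the next readdir knows its saved position is stale. A loose userspace sketch of that cookie idea; dir_state/open_file and the version counter are invented for illustration and do not correspond to kernel structures:

/* Per-open cookie: cleared on seek, refreshed after each successful scan. */
#include <stdint.h>
#include <stdio.h>

struct dir_state {
	uint64_t version;	/* bumped whenever the directory changes */
};

struct open_file {
	uint64_t cookie;	/* version last seen by this open file */
	long pos;
};

static void dir_llseek(struct open_file *f, long pos)
{
	f->pos = pos;
	f->cookie = 0;		/* force the next readdir to treat its state as stale */
}

static void dir_readdir(struct open_file *f, const struct dir_state *d)
{
	if (f->cookie != d->version) {
		printf("stale position, rescanning from the start\n");
		f->pos = 0;
	}
	/* ... emit entries from f->pos ... */
	f->cookie = d->version;	/* remember which version this pass saw */
}

int main(void)
{
	struct dir_state d = { .version = 1 };
	struct open_file f = { 0 };

	dir_readdir(&f, &d);	/* first pass: rescans */
	d.version = 2;		/* directory modified behind our back */
	dir_llseek(&f, 10);	/* seek clears the cookie */
	dir_readdir(&f, &d);	/* rescans again */
	return 0;
}
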
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 68e104423a48..5130123005e4 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -211,7 +211,7 @@ static void release_existing_page_budget(struct ubifs_info *c)
}
static int write_begin_slow(struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep)
+ loff_t pos, unsigned len, struct folio **foliop)
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -298,7 +298,7 @@ static int write_begin_slow(struct address_space *mapping,
ubifs_release_dirty_inode_budget(c, ui);
}
- *pagep = &folio->page;
+ *foliop = folio;
return 0;
}
@@ -414,7 +414,7 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio,
*/
static int ubifs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -483,7 +483,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- return write_begin_slow(mapping, pos, len, pagep);
+ return write_begin_slow(mapping, pos, len, foliop);
}
/*
@@ -492,7 +492,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
* with @ui->ui_mutex locked if we are appending pages, and unlocked
* otherwise. This is an optimization (slightly hacky though).
*/
- *pagep = &folio->page;
+ *foliop = folio;
return 0;
}
@@ -524,9 +524,8 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio,
static int ubifs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
- struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f94f45fe2c91..5023dfe191e8 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -60,7 +60,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
* identifying beginning of dir entry (names are under user control),
* we need to scan the directory from the beginning.
*/
- if (!inode_eq_iversion(dir, file->f_version)) {
+ if (!inode_eq_iversion(dir, *(u64 *)file->private_data)) {
emit_pos = nf_pos;
nf_pos = 0;
} else {
@@ -122,15 +122,37 @@ out_iter:
udf_fiiter_release(&iter);
out:
if (pos_valid)
- file->f_version = inode_query_iversion(dir);
+ *(u64 *)file->private_data = inode_query_iversion(dir);
kfree(fname);
return ret;
}
+static int udf_dir_open(struct inode *inode, struct file *file)
+{
+ file->private_data = kzalloc(sizeof(u64), GFP_KERNEL);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int udf_dir_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static loff_t udf_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_llseek_cookie(file, offset, whence,
+ (u64 *)file->private_data);
+}
+
/* readdir and lookup functions */
const struct file_operations udf_dir_operations = {
- .llseek = generic_file_llseek,
+ .open = udf_dir_open,
+ .release = udf_dir_release,
+ .llseek = udf_dir_llseek,
.read = generic_read_dir,
.iterate_shared = udf_readdir,
.unlocked_ioctl = udf_ioctl,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 3a4179de316b..412fe7c4d348 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -62,7 +62,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
end = size & ~PAGE_MASK;
else
end = PAGE_SIZE;
- err = __block_write_begin(&folio->page, 0, end, udf_get_block);
+ err = __block_write_begin(folio, 0, end, udf_get_block);
if (err) {
folio_unlock(folio);
ret = vmf_fs_error(err);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4726a4d014b6..eaee57b91c6c 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -246,14 +246,14 @@ static void udf_readahead(struct readahead_control *rac)
static int udf_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
struct udf_inode_info *iinfo = UDF_I(file_inode(file));
struct folio *folio;
int ret;
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
- ret = block_write_begin(mapping, pos, len, pagep,
+ ret = block_write_begin(mapping, pos, len, foliop,
udf_get_block);
if (unlikely(ret))
udf_write_failed(mapping, pos + len);
@@ -265,7 +265,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
- *pagep = &folio->page;
+ *foliop = folio;
if (!folio_test_uptodate(folio))
udf_adinicb_read_folio(folio);
return 0;
@@ -273,16 +273,14 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
static int udf_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = file_inode(file);
- struct folio *folio;
loff_t last_pos;
if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
- return generic_write_end(file, mapping, pos, len, copied, page,
+ return generic_write_end(file, mapping, pos, len, copied, folio,
fsdata);
- folio = page_folio(page);
last_pos = pos + copied;
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
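
Taken together, the udf hunks show the shape every ->write_begin/->write_end pair takes after the folio conversion: both methods traffic in struct folio * and the generic helpers accept it directly. A sketch under that assumption for a simple block-based filesystem; foofs_get_block stands in for the filesystem's real get_block routine and is not from the patch:

static int foofs_write_begin(struct file *file, struct address_space *mapping,
			     loff_t pos, unsigned len,
			     struct folio **foliop, void **fsdata)
{
	/* hands back a locked folio in *foliop on success */
	return block_write_begin(mapping, pos, len, foliop, foofs_get_block);
}

static int foofs_write_end(struct file *file, struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned copied,
			   struct folio *folio, void *fsdata)
{
	/* unlocks and releases the folio, updates i_size as needed */
	return generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
}
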
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 61f25d3cf3f7..d6e6a2198971 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -42,18 +42,18 @@ static inline int ufs_match(struct super_block *sb, int len,
return !memcmp(name, de->d_name, len);
}
-static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ block_write_end(NULL, mapping, pos, len, len, folio, NULL);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
- unlock_page(page);
+ folio_unlock(folio);
}
static int ufs_handle_dirsync(struct inode *dir)
@@ -66,22 +66,16 @@ static int ufs_handle_dirsync(struct inode *dir)
return err;
}
-static inline void ufs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
{
ino_t res = 0;
struct ufs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
- de = ufs_find_entry(dir, qstr, &page);
+ de = ufs_find_entry(dir, qstr, &folio);
if (de) {
res = fs32_to_cpu(dir->i_sb, de->d_ino);
- ufs_put_page(page);
+ folio_release_kmap(folio, de);
}
return res;
}
@@ -89,43 +83,40 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
/* Releases the page */
void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct page *page, struct inode *inode,
+ struct folio *folio, struct inode *inode,
bool update_times)
{
- loff_t pos = page_offset(page) +
- (char *) de - (char *) page_address(page);
+ loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen);
int err;
- lock_page(page);
- err = ufs_prepare_chunk(page, pos, len);
+ folio_lock(folio);
+ err = ufs_prepare_chunk(folio, pos, len);
BUG_ON(err);
de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
ufs_set_de_type(dir->i_sb, de, inode->i_mode);
- ufs_commit_chunk(page, pos, len);
- ufs_put_page(page);
+ ufs_commit_chunk(folio, pos, len);
+ folio_release_kmap(folio, de);
if (update_times)
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
ufs_handle_dirsync(dir);
}
-
-static bool ufs_check_page(struct page *page)
+static bool ufs_check_folio(struct folio *folio, char *kaddr)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = folio->mapping->host;
struct super_block *sb = dir->i_sb;
- char *kaddr = page_address(page);
unsigned offs, rec_len;
- unsigned limit = PAGE_SIZE;
+ unsigned limit = folio_size(folio);
const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1;
struct ufs_dir_entry *p;
char *error;
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
+ if (dir->i_size < folio_pos(folio) + limit) {
+ limit = offset_in_folio(folio, dir->i_size);
if (limit & chunk_mask)
goto Ebadsize;
if (!limit)
@@ -150,13 +141,13 @@ static bool ufs_check_page(struct page *page)
if (offs != limit)
goto Eend;
out:
- SetPageChecked(page);
+ folio_set_checked(folio);
return true;
/* Too bad, we had an error */
Ebadsize:
- ufs_error(sb, "ufs_check_page",
+ ufs_error(sb, __func__,
"size of directory #%lu is not a multiple of chunk size",
dir->i_ino
);
@@ -176,36 +167,40 @@ Espan:
Einumber:
error = "inode out of bounds";
bad_entry:
- ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
- "offset=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
+ ufs_error(sb, __func__, "bad entry in directory #%lu: %s - "
+ "offset=%llu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, folio_pos(folio) + offs,
rec_len, ufs_get_de_namlen(sb, p));
goto fail;
Eend:
p = (struct ufs_dir_entry *)(kaddr + offs);
ufs_error(sb, __func__,
"entry in directory #%lu spans the page boundary"
- "offset=%lu",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs);
+ "offset=%llu",
+ dir->i_ino, folio_pos(folio) + offs);
fail:
return false;
}
-static struct page *ufs_get_page(struct inode *dir, unsigned long n)
+static void *ufs_get_folio(struct inode *dir, unsigned long n,
+ struct folio **foliop)
{
struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
- if (!IS_ERR(page)) {
- kmap(page);
- if (unlikely(!PageChecked(page))) {
- if (!ufs_check_page(page))
- goto fail;
- }
+ struct folio *folio = read_mapping_folio(mapping, n, NULL);
+ void *kaddr;
+
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
+ kaddr = kmap_local_folio(folio, 0);
+ if (unlikely(!folio_test_checked(folio))) {
+ if (!ufs_check_folio(folio, kaddr))
+ goto fail;
}
- return page;
+ *foliop = folio;
+ return kaddr;
fail:
- ufs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return ERR_PTR(-EIO);
}
@@ -231,17 +226,14 @@ ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
fs16_to_cpu(sb, p->d_reclen));
}
-struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
+struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct folio **foliop)
{
- struct page *page = ufs_get_page(dir, 0);
- struct ufs_dir_entry *de = NULL;
+ struct ufs_dir_entry *de = ufs_get_folio(dir, 0, foliop);
- if (!IS_ERR(page)) {
- de = ufs_next_entry(dir->i_sb,
- (struct ufs_dir_entry *)page_address(page));
- *p = page;
- }
- return de;
+ if (!IS_ERR(de))
+ return ufs_next_entry(dir->i_sb, de);
+
+ return NULL;
}
/*
@@ -253,7 +245,7 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
* Entry is guaranteed to be valid.
*/
struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
- struct page **res_page)
+ struct folio **foliop)
{
struct super_block *sb = dir->i_sb;
const unsigned char *name = qstr->name;
@@ -261,7 +253,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
unsigned reclen = UFS_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
struct ufs_inode_info *ui = UFS_I(dir);
struct ufs_dir_entry *de;
@@ -270,27 +261,23 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
if (npages == 0 || namelen > UFS_MAXNAMLEN)
goto out;
- /* OFFSET_CACHE */
- *res_page = NULL;
-
start = ui->i_dir_start_lookup;
if (start >= npages)
start = 0;
n = start;
do {
- char *kaddr;
- page = ufs_get_page(dir, n);
- if (!IS_ERR(page)) {
- kaddr = page_address(page);
- de = (struct ufs_dir_entry *) kaddr;
+ char *kaddr = ufs_get_folio(dir, n, foliop);
+
+ if (!IS_ERR(kaddr)) {
+ de = (struct ufs_dir_entry *)kaddr;
kaddr += ufs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
if (ufs_match(sb, namelen, name, de))
goto found;
de = ufs_next_entry(sb, de);
}
- ufs_put_page(page);
+ folio_release_kmap(*foliop, kaddr);
}
if (++n >= npages)
n = 0;
@@ -299,7 +286,6 @@ out:
return NULL;
found:
- *res_page = page;
ui->i_dir_start_lookup = n;
return de;
}
@@ -316,11 +302,10 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
unsigned reclen = UFS_DIR_REC_LEN(namelen);
const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize;
unsigned short rec_len, name_len;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct ufs_dir_entry *de;
unsigned long npages = dir_pages(dir);
unsigned long n;
- char *kaddr;
loff_t pos;
int err;
@@ -328,21 +313,19 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
/*
* We take care of directory expansion in the same loop.
- * This code plays outside i_size, so it locks the page
+ * This code plays outside i_size, so it locks the folio
* to protect that region.
*/
for (n = 0; n <= npages; n++) {
+ char *kaddr = ufs_get_folio(dir, n, &folio);
char *dir_end;
- page = ufs_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
- lock_page(page);
- kaddr = page_address(page);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
+ folio_lock(folio);
dir_end = kaddr + ufs_last_byte(dir, n);
de = (struct ufs_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - reclen;
+ kaddr += folio_size(folio) - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -369,16 +352,15 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
goto got_it;
de = (struct ufs_dir_entry *) ((char *) de + rec_len);
}
- unlock_page(page);
- ufs_put_page(page);
+ folio_unlock(folio);
+ folio_release_kmap(folio, kaddr);
}
BUG();
return -EINVAL;
got_it:
- pos = page_offset(page) +
- (char*)de - (char*)page_address(page);
- err = ufs_prepare_chunk(page, pos, rec_len);
+ pos = folio_pos(folio) + offset_in_folio(folio, de);
+ err = ufs_prepare_chunk(folio, pos, rec_len);
if (err)
goto out_unlock;
if (de->d_ino) {
@@ -395,18 +377,17 @@ got_it:
de->d_ino = cpu_to_fs32(sb, inode->i_ino);
ufs_set_de_type(sb, de, inode->i_mode);
- ufs_commit_chunk(page, pos, rec_len);
+ ufs_commit_chunk(folio, pos, rec_len);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
err = ufs_handle_dirsync(dir);
/* OFFSET_CACHE */
out_put:
- ufs_put_page(page);
-out:
+ folio_release_kmap(folio, de);
return err;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
goto out_put;
}
@@ -435,7 +416,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
- bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
+ bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data);
unsigned flags = UFS_SB(sb)->s_flags;
UFSD("BEGIN\n");
@@ -444,25 +425,24 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
return 0;
for ( ; n < npages; n++, offset = 0) {
- char *kaddr, *limit;
struct ufs_dir_entry *de;
+ struct folio *folio;
+ char *kaddr = ufs_get_folio(inode, n, &folio);
+ char *limit;
- struct page *page = ufs_get_page(inode, n);
-
- if (IS_ERR(page)) {
+ if (IS_ERR(kaddr)) {
ufs_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
ctx->pos += PAGE_SIZE - offset;
- return -EIO;
+ return PTR_ERR(kaddr);
}
- kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
- file->f_version = inode_query_iversion(inode);
+ *(u64 *)file->private_data = inode_query_iversion(inode);
need_revalidate = false;
}
de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -482,13 +462,13 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
ufs_get_de_namlen(sb, de),
fs32_to_cpu(sb, de->d_ino),
d_type)) {
- ufs_put_page(page);
+ folio_release_kmap(folio, de);
return 0;
}
}
ctx->pos += fs16_to_cpu(sb, de->d_reclen);
}
- ufs_put_page(page);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
@@ -499,19 +479,23 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
* previous entry.
*/
int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
- struct page * page)
+ struct folio *folio)
{
struct super_block *sb = inode->i_sb;
- char *kaddr = page_address(page);
- unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
- unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
+ size_t from, to;
+ char *kaddr;
loff_t pos;
- struct ufs_dir_entry *pde = NULL;
- struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
+ struct ufs_dir_entry *de, *pde = NULL;
int err;
UFSD("ENTER\n");
+ from = offset_in_folio(folio, dir);
+ to = from + fs16_to_cpu(sb, dir->d_reclen);
+ kaddr = (char *)dir - from;
+ from &= ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
+ de = (struct ufs_dir_entry *) (kaddr + from);
+
UFSD("ino %u, reclen %u, namlen %u, name %s\n",
fs32_to_cpu(sb, de->d_ino),
fs16_to_cpu(sb, de->d_reclen),
@@ -528,21 +512,20 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
de = ufs_next_entry(sb, de);
}
if (pde)
- from = (char*)pde - (char*)page_address(page);
-
- pos = page_offset(page) + from;
- lock_page(page);
- err = ufs_prepare_chunk(page, pos, to - from);
+ from = offset_in_folio(folio, pde);
+ pos = folio_pos(folio) + from;
+ folio_lock(folio);
+ err = ufs_prepare_chunk(folio, pos, to - from);
BUG_ON(err);
if (pde)
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
- ufs_commit_chunk(page, pos, to - from);
+ ufs_commit_chunk(folio, pos, to - from);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
err = ufs_handle_dirsync(inode);
out:
- ufs_put_page(page);
+ folio_release_kmap(folio, kaddr);
UFSD("EXIT\n");
return err;
}
@@ -551,26 +534,25 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
{
struct super_block * sb = dir->i_sb;
struct address_space *mapping = inode->i_mapping;
- struct page *page = grab_cache_page(mapping, 0);
+ struct folio *folio = filemap_grab_folio(mapping, 0);
const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize;
struct ufs_dir_entry * de;
- char *base;
int err;
+ char *kaddr;
- if (!page)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- err = ufs_prepare_chunk(page, 0, chunk_size);
+ err = ufs_prepare_chunk(folio, 0, chunk_size);
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
goto fail;
}
- kmap(page);
- base = (char*)page_address(page);
- memset(base, 0, PAGE_SIZE);
+ kaddr = kmap_local_folio(folio, 0);
+ memset(kaddr, 0, folio_size(folio));
- de = (struct ufs_dir_entry *) base;
+ de = (struct ufs_dir_entry *)kaddr;
de->d_ino = cpu_to_fs32(sb, inode->i_ino);
ufs_set_de_type(sb, de, inode->i_mode);
@@ -584,12 +566,12 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1));
ufs_set_de_namlen(sb, de, 2);
strcpy (de->d_name, "..");
- kunmap(page);
+ kunmap_local(kaddr);
- ufs_commit_chunk(page, 0, chunk_size);
+ ufs_commit_chunk(folio, 0, chunk_size);
err = ufs_handle_dirsync(inode);
fail:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -599,18 +581,17 @@ fail:
int ufs_empty_dir(struct inode * inode)
{
struct super_block *sb = inode->i_sb;
- struct page *page = NULL;
+ struct folio *folio;
+ char *kaddr;
unsigned long i, npages = dir_pages(inode);
for (i = 0; i < npages; i++) {
- char *kaddr;
struct ufs_dir_entry *de;
- page = ufs_get_page(inode, i);
- if (IS_ERR(page))
+ kaddr = ufs_get_folio(inode, i, &folio);
+ if (IS_ERR(kaddr))
continue;
- kaddr = page_address(page);
de = (struct ufs_dir_entry *)kaddr;
kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
@@ -637,18 +618,40 @@ int ufs_empty_dir(struct inode * inode)
}
de = ufs_next_entry(sb, de);
}
- ufs_put_page(page);
+ folio_release_kmap(folio, kaddr);
}
return 1;
not_empty:
- ufs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return 0;
}
+static int ufs_dir_open(struct inode *inode, struct file *file)
+{
+ file->private_data = kzalloc(sizeof(u64), GFP_KERNEL);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int ufs_dir_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static loff_t ufs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ return generic_llseek_cookie(file, offset, whence,
+ (u64 *)file->private_data);
+}
+
const struct file_operations ufs_dir_operations = {
+ .open = ufs_dir_open,
+ .release = ufs_dir_release,
.read = generic_read_dir,
.iterate_shared = ufs_readdir,
.fsync = generic_file_fsync,
- .llseek = generic_file_llseek,
+ .llseek = ufs_dir_llseek,
};
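
The ufs directory code above leans on one helper throughout: folio_release_kmap(), which undoes a kmap_local_folio() mapping and drops the folio reference in a single call, replacing the old kmap()/kunmap()/put_page() triple. A toy reader showing the pairing; note that any address inside the mapping (here kaddr, in the patch sometimes a dirent pointer) may be handed back:

static void foofs_dump_first_byte(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio = read_mapping_folio(mapping, index, NULL);
	char *kaddr;

	if (IS_ERR(folio))
		return;
	kaddr = kmap_local_folio(folio, 0);
	pr_info("first byte: %02x\n", (unsigned char)kaddr[0]);
	folio_release_kmap(folio, kaddr);	/* kunmap_local() + folio_put() */
}
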
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index a7bb2e63cdde..5331ae7ebf3e 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -479,9 +479,9 @@ static int ufs_read_folio(struct file *file, struct folio *folio)
return block_read_full_folio(folio, ufs_getfrag_block);
}
-int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
+int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
{
- return __block_write_begin(page, pos, len, ufs_getfrag_block);
+ return __block_write_begin(folio, pos, len, ufs_getfrag_block);
}
static void ufs_truncate_blocks(struct inode *);
@@ -498,11 +498,11 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
static int ufs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = block_write_begin(mapping, pos, len, pagep, ufs_getfrag_block);
+ ret = block_write_begin(mapping, pos, len, foliop, ufs_getfrag_block);
if (unlikely(ret))
ufs_write_failed(mapping, pos + len);
@@ -511,11 +511,11 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
static int ufs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
ufs_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 9cad29463791..24bd12186647 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -209,14 +209,14 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode * inode = d_inode(dentry);
struct ufs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
int err = -ENOENT;
- de = ufs_find_entry(dir, &dentry->d_name, &page);
+ de = ufs_find_entry(dir, &dentry->d_name, &folio);
if (!de)
goto out;
- err = ufs_delete_entry(dir, de, page);
+ err = ufs_delete_entry(dir, de, folio);
if (err)
goto out;
@@ -249,28 +249,28 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct page *dir_page = NULL;
+ struct folio *dir_folio = NULL;
struct ufs_dir_entry * dir_de = NULL;
- struct page *old_page;
+ struct folio *old_folio;
struct ufs_dir_entry *old_de;
int err = -ENOENT;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
- old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_de)
goto out;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
- dir_de = ufs_dotdot(old_inode, &dir_page);
+ dir_de = ufs_dotdot(old_inode, &dir_folio);
if (!dir_de)
goto out_old;
}
if (new_inode) {
- struct page *new_page;
+ struct folio *new_folio;
struct ufs_dir_entry *new_de;
err = -ENOTEMPTY;
@@ -278,10 +278,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out_dir;
err = -ENOENT;
- new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
+ new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
if (!new_de)
goto out_dir;
- ufs_set_link(new_dir, new_de, new_page, old_inode, 1);
+ ufs_set_link(new_dir, new_de, new_folio, old_inode, 1);
inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
@@ -300,29 +300,24 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
*/
inode_set_ctime_current(old_inode);
- ufs_delete_entry(old_dir, old_de, old_page);
+ ufs_delete_entry(old_dir, old_de, old_folio);
mark_inode_dirty(old_inode);
if (dir_de) {
if (old_dir != new_dir)
- ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0);
- else {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0);
+ else
+ folio_release_kmap(dir_folio, new_dir);
inode_dec_link_count(old_dir);
}
return 0;
out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ if (dir_de)
+ folio_release_kmap(dir_folio, dir_de);
out_old:
- kunmap(old_page);
- put_page(old_page);
+ folio_release_kmap(old_folio, old_de);
out:
return err;
}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 6b499180643b..a2c762cb65a0 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -99,15 +99,17 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
/* dir.c */
extern const struct inode_operations ufs_dir_inode_operations;
-extern int ufs_add_link (struct dentry *, struct inode *);
-extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
-extern int ufs_make_empty(struct inode *, struct inode *);
-extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
-extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
-extern int ufs_empty_dir (struct inode *);
-extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
-extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct page *page, struct inode *inode, bool update_times);
+
+int ufs_add_link(struct dentry *, struct inode *);
+ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
+int ufs_make_empty(struct inode *, struct inode *);
+struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *,
+ struct folio **);
+int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *);
+int ufs_empty_dir(struct inode *);
+struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **);
+void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+ struct folio *folio, struct inode *inode, bool update_times);
/* file.c */
extern const struct inode_operations ufs_file_inode_operations;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 0ecd2ed792f5..bf708b68f150 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -250,9 +250,9 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value)
}
}
-extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *);
-extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t);
-extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len);
+dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *);
+void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t);
+int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len);
/*
* These functions manipulate ufs buffers
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index fdb4da24d662..b780deb81b02 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -300,23 +300,23 @@ static int vboxsf_writepage(struct page *page, struct writeback_control *wbc)
static int vboxsf_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned int len, unsigned int copied,
- struct page *page, void *fsdata)
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct vboxsf_handle *sf_handle = file->private_data;
- unsigned int from = pos & ~PAGE_MASK;
+ size_t from = offset_in_folio(folio, pos);
u32 nwritten = len;
u8 *buf;
int err;
- /* zero the stale part of the page if we did a short copy */
- if (!PageUptodate(page) && copied < len)
- zero_user(page, from + copied, len - copied);
+ /* zero the stale part of the folio if we did a short copy */
+ if (!folio_test_uptodate(folio) && copied < len)
+ folio_zero_range(folio, from + copied, len - copied);
- buf = kmap(page);
+ buf = kmap(&folio->page);
err = vboxsf_write(sf_handle->root, sf_handle->handle,
pos, &nwritten, buf + from);
- kunmap(page);
+ kunmap(&folio->page);
if (err) {
nwritten = 0;
@@ -326,16 +326,16 @@ static int vboxsf_write_end(struct file *file, struct address_space *mapping,
/* mtime changed */
VBOXSF_I(inode)->force_restat = 1;
- if (!PageUptodate(page) && nwritten == PAGE_SIZE)
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio) && nwritten == folio_size(folio))
+ folio_mark_uptodate(folio);
pos += nwritten;
if (pos > inode->i_size)
i_size_write(inode, pos);
out:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return nwritten;
}
@@ -343,7 +343,7 @@ out:
/*
* Note simple_write_begin does not read the page from disk on partial writes
* this is ok since vboxsf_write_end only writes the written parts of the
- * page and it does not call SetPageUptodate for partial writes.
+ * page and it does not call folio_mark_uptodate for partial writes.
*/
const struct address_space_operations vboxsf_reg_aops = {
.read_folio = vboxsf_read_folio,
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index 90c07573dd77..0302a4e506ec 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -17,6 +17,7 @@
#include <linux/cred.h>
#include <linux/key.h>
+#include <linux/security.h>
#include <linux/slab.h>
#include <linux/verification.h>
@@ -41,7 +42,11 @@ static struct key *fsverity_keyring;
* @sig_size: size of signature in bytes, or 0 if no signature
*
* If the file includes a signature of its fs-verity file digest, verify it
- * against the certificates in the fs-verity keyring.
+ * against the certificates in the fs-verity keyring. Note that signatures
+ * are verified regardless of the state of the 'fsverity_require_signatures'
+ * variable and the LSM subsystem relies on this behavior to help enforce
+ * file integrity policies. Please discuss changes with the LSM list
+ * (thank you!).
*
* Return: 0 on success (signature valid or not required); -errno on failure
*/
@@ -106,6 +111,17 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return err;
}
+ err = security_inode_setintegrity(inode,
+ LSM_INT_FSVERITY_BUILTINSIG_VALID,
+ signature,
+ sig_size);
+
+ if (err) {
+ fsverity_err(inode, "Error %d exposing file signature to LSMs",
+ err);
+ return err;
+ }
+
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 7e80732cb547..5f0494702e0b 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -46,7 +46,7 @@ xfs_perag_get(
struct xfs_perag *pag;
rcu_read_lock();
- pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ pag = xa_load(&mp->m_perags, agno);
if (pag) {
trace_xfs_perag_get(pag, _RET_IP_);
ASSERT(atomic_read(&pag->pag_ref) >= 0);
@@ -56,31 +56,6 @@ xfs_perag_get(
return pag;
}
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_get_tag(
- struct xfs_mount *mp,
- xfs_agnumber_t first,
- unsigned int tag)
-{
- struct xfs_perag *pag;
- int found;
-
- rcu_read_lock();
- found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
- (void **)&pag, first, 1, tag);
- if (found <= 0) {
- rcu_read_unlock();
- return NULL;
- }
- trace_xfs_perag_get_tag(pag, _RET_IP_);
- atomic_inc(&pag->pag_ref);
- rcu_read_unlock();
- return pag;
-}
-
/* Get a passive reference to the given perag. */
struct xfs_perag *
xfs_perag_hold(
@@ -117,7 +92,7 @@ xfs_perag_grab(
struct xfs_perag *pag;
rcu_read_lock();
- pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+ pag = xa_load(&mp->m_perags, agno);
if (pag) {
trace_xfs_perag_grab(pag, _RET_IP_);
if (!atomic_inc_not_zero(&pag->pag_active_ref))
@@ -127,32 +102,6 @@ xfs_perag_grab(
return pag;
}
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_grab_tag(
- struct xfs_mount *mp,
- xfs_agnumber_t first,
- int tag)
-{
- struct xfs_perag *pag;
- int found;
-
- rcu_read_lock();
- found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
- (void **)&pag, first, 1, tag);
- if (found <= 0) {
- rcu_read_unlock();
- return NULL;
- }
- trace_xfs_perag_grab_tag(pag, _RET_IP_);
- if (!atomic_inc_not_zero(&pag->pag_active_ref))
- pag = NULL;
- rcu_read_unlock();
- return pag;
-}
-
void
xfs_perag_rele(
struct xfs_perag *pag)
@@ -235,16 +184,6 @@ out:
return error;
}
-STATIC void
-__xfs_free_perag(
- struct rcu_head *head)
-{
- struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
-
- ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
- kfree(pag);
-}
-
/*
* Free up the per-ag resources associated with the mount structure.
*/
@@ -256,9 +195,7 @@ xfs_free_perag(
xfs_agnumber_t agno;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- spin_lock(&mp->m_perag_lock);
- pag = radix_tree_delete(&mp->m_perag_tree, agno);
- spin_unlock(&mp->m_perag_lock);
+ pag = xa_erase(&mp->m_perags, agno);
ASSERT(pag);
XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
xfs_defer_drain_free(&pag->pag_intents_drain);
@@ -270,7 +207,7 @@ xfs_free_perag(
xfs_perag_rele(pag);
XFS_IS_CORRUPT(pag->pag_mount,
atomic_read(&pag->pag_active_ref) != 0);
- call_rcu(&pag->rcu_head, __xfs_free_perag);
+ kfree_rcu_mightsleep(pag);
}
}
@@ -347,9 +284,7 @@ xfs_free_unused_perag_range(
xfs_agnumber_t index;
for (index = agstart; index < agend; index++) {
- spin_lock(&mp->m_perag_lock);
- pag = radix_tree_delete(&mp->m_perag_tree, index);
- spin_unlock(&mp->m_perag_lock);
+ pag = xa_erase(&mp->m_perags, index);
if (!pag)
break;
xfs_buf_cache_destroy(&pag->pag_bcache);
@@ -390,20 +325,11 @@ xfs_initialize_perag(
pag->pag_agno = index;
pag->pag_mount = mp;
- error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
- if (error)
- goto out_free_pag;
-
- spin_lock(&mp->m_perag_lock);
- if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
- WARN_ON_ONCE(1);
- spin_unlock(&mp->m_perag_lock);
- radix_tree_preload_end();
- error = -EEXIST;
+ error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL);
+ if (error) {
+ WARN_ON_ONCE(error == -EBUSY);
goto out_free_pag;
}
- spin_unlock(&mp->m_perag_lock);
- radix_tree_preload_end();
#ifdef __KERNEL__
/* Place kernel structure only init below this point. */
@@ -451,9 +377,7 @@ xfs_initialize_perag(
out_remove_pag:
xfs_defer_drain_free(&pag->pag_intents_drain);
- spin_lock(&mp->m_perag_lock);
- radix_tree_delete(&mp->m_perag_tree, index);
- spin_unlock(&mp->m_perag_lock);
+ pag = xa_erase(&mp->m_perags, index);
out_free_pag:
kfree(pag);
out_unwind_new_pags:
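
The xfs_ag.c hunks above follow the usual radix-tree-to-xarray recipe: xa_load() under RCU for lookups, xa_insert() for population (it handles its own locking and allocation, which is why the radix_tree_preload() and m_perag_lock dance disappears), and xa_erase() for teardown. A reduced sketch with made-up names ("my_objs", struct my_obj), mirroring how the perag lookups bump a reference before leaving the RCU section:

struct my_obj {
	refcount_t	refcount;
};

static DEFINE_XARRAY(my_objs);

static struct my_obj *my_obj_lookup(unsigned long index)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = xa_load(&my_objs, index);
	if (obj && !refcount_inc_not_zero(&obj->refcount))
		obj = NULL;	/* lost a race with teardown */
	rcu_read_unlock();
	return obj;
}

static int my_obj_install(unsigned long index, struct my_obj *obj)
{
	/* returns -EBUSY if an entry already lives at @index */
	return xa_insert(&my_objs, index, obj, GFP_KERNEL);
}

static struct my_obj *my_obj_remove(unsigned long index)
{
	return xa_erase(&my_objs, index);
}
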
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 35de09a2516c..d9cccd093b60 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -63,9 +63,6 @@ struct xfs_perag {
/* Blocks reserved for the reverse mapping btree. */
struct xfs_ag_resv pag_rmapbt_resv;
- /* for rcu-safe freeing */
- struct rcu_head rcu_head;
-
/* Precalculated geometry info */
xfs_agblock_t block_count;
xfs_agblock_t min_block;
@@ -156,15 +153,11 @@ void xfs_free_perag(struct xfs_mount *mp);
/* Passive AG references */
struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
-struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
- unsigned int tag);
struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag);
void xfs_perag_put(struct xfs_perag *pag);
/* Active AG references */
struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t);
-struct xfs_perag *xfs_perag_grab_tag(struct xfs_mount *, xfs_agnumber_t,
- int tag);
void xfs_perag_rele(struct xfs_perag *pag);
/*
@@ -266,13 +259,6 @@ xfs_perag_next(
(agno) = 0; \
for_each_perag_from((mp), (agno), (pag))
-#define for_each_perag_tag(mp, agno, pag, tag) \
- for ((agno) = 0, (pag) = xfs_perag_grab_tag((mp), 0, (tag)); \
- (pag) != NULL; \
- (agno) = (pag)->pag_agno + 1, \
- xfs_perag_rele(pag), \
- (pag) = xfs_perag_grab_tag((mp), (agno), (tag)))
-
static inline struct xfs_perag *
xfs_perag_next_wrap(
struct xfs_perag *pag,
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 585e98e87ef9..aada676eee51 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -569,11 +569,11 @@ xfs_allocbt_block_maxrecs(
/*
* Calculate number of records in an alloc btree block.
*/
-int
+unsigned int
xfs_allocbt_maxrecs(
struct xfs_mount *mp,
- int blocklen,
- int leaf)
+ unsigned int blocklen,
+ bool leaf)
{
blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
return xfs_allocbt_block_maxrecs(blocklen, leaf);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 155b47f231ab..12647f9aaa6d 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -53,7 +53,8 @@ struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp,
struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_perag *pag);
-extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+unsigned int xfs_allocbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index b9e98950eb3d..6aaec1246c95 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -686,7 +686,7 @@ xfs_attr_shortform_bytesfit(
*/
if (!dp->i_forkoff && dp->i_df.if_bytes >
xfs_default_attroffset(dp))
- dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ dsize = xfs_bmdr_space_calc(MINDBTPTRS);
break;
case XFS_DINODE_FMT_BTREE:
/*
@@ -700,7 +700,7 @@ xfs_attr_shortform_bytesfit(
return 0;
return dp->i_forkoff;
}
- dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+ dsize = xfs_bmap_bmdr_space(dp->i_df.if_broot);
break;
}
@@ -708,11 +708,11 @@ xfs_attr_shortform_bytesfit(
* A data fork btree root must have space for at least
* MINDBTPTRS key/ptr pairs if the data fork is small or empty.
*/
- minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+ minforkoff = max_t(int64_t, dsize, xfs_bmdr_space_calc(MINDBTPTRS));
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
if (offset >= maxforkoff)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 7df74c35d9f9..8090e8249116 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -79,9 +79,9 @@ xfs_bmap_compute_maxlevels(
maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
whichfork);
if (whichfork == XFS_DATA_FORK)
- sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+ sz = xfs_bmdr_space_calc(MINDBTPTRS);
else
- sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ sz = xfs_bmdr_space_calc(MINABTPTRS);
maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
@@ -102,8 +102,8 @@ xfs_bmap_compute_attr_offset(
struct xfs_mount *mp)
{
if (mp->m_sb.sb_inodesize == 256)
- return XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
- return XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+ return XFS_LITINO(mp) - xfs_bmdr_space_calc(MINABTPTRS);
+ return xfs_bmdr_space_calc(6 * MINABTPTRS);
}
STATIC int /* error */
@@ -298,7 +298,7 @@ xfs_check_block(
prevp = NULL;
for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
dmxr = mp->m_bmap_dmxr[0];
- keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+ keyp = xfs_bmbt_key_addr(mp, block, i);
if (prevp) {
ASSERT(be64_to_cpu(prevp->br_startoff) <
@@ -310,15 +310,15 @@ xfs_check_block(
* Compare the block numbers to see if there are dups.
*/
if (root)
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+ pp = xfs_bmap_broot_ptr_addr(mp, block, i, sz);
else
- pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+ pp = xfs_bmbt_ptr_addr(mp, block, i, dmxr);
for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
if (root)
- thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+ thispa = xfs_bmap_broot_ptr_addr(mp, block, j, sz);
else
- thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+ thispa = xfs_bmbt_ptr_addr(mp, block, j, dmxr);
if (*thispa == *pp) {
xfs_warn(mp, "%s: thispa(%d) == pp(%d) %lld",
__func__, j, i,
@@ -373,7 +373,7 @@ xfs_bmap_check_leaf_extents(
level = be16_to_cpu(block->bb_level);
ASSERT(level > 0);
xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ pp = xfs_bmap_broot_ptr_addr(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
ASSERT(bno != NULLFSBLOCK);
@@ -406,7 +406,7 @@ xfs_bmap_check_leaf_extents(
*/
xfs_check_block(block, mp, 0, 0);
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+ pp = xfs_bmbt_ptr_addr(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
xfs_btree_mark_sick(cur);
@@ -446,14 +446,14 @@ xfs_bmap_check_leaf_extents(
* conform with the first entry in this one.
*/
- ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+ ep = xfs_bmbt_rec_addr(mp, block, 1);
if (i) {
ASSERT(xfs_bmbt_disk_get_startoff(&last) +
xfs_bmbt_disk_get_blockcount(&last) <=
xfs_bmbt_disk_get_startoff(ep));
}
for (j = 1; j < num_recs; j++) {
- nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ nextp = xfs_bmbt_rec_addr(mp, block, j + 1);
ASSERT(xfs_bmbt_disk_get_startoff(ep) +
xfs_bmbt_disk_get_blockcount(ep) <=
xfs_bmbt_disk_get_startoff(nextp));
@@ -584,9 +584,9 @@ xfs_bmap_btree_to_extents(
ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
- ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false) == 1);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+ pp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
#ifdef DEBUG
if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) {
@@ -714,7 +714,7 @@ xfs_bmap_extents_to_btree(
for_each_xfs_iext(ifp, &icur, &rec) {
if (isnullstartblock(rec.br_startblock))
continue;
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1 + cnt);
+ arp = xfs_bmbt_rec_addr(mp, ablock, 1 + cnt);
xfs_bmbt_disk_set_all(arp, &rec);
cnt++;
}
@@ -724,10 +724,10 @@ xfs_bmap_extents_to_btree(
/*
* Fill in the root key and pointer.
*/
- kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
- arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+ kp = xfs_bmbt_key_addr(mp, block, 1);
+ arp = xfs_bmbt_rec_addr(mp, ablock, 1);
kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ pp = xfs_bmbt_ptr_addr(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
be16_to_cpu(block->bb_level)));
*pp = cpu_to_be64(args.fsbno);
@@ -896,7 +896,7 @@ xfs_bmap_add_attrfork_btree(
mp = ip->i_mount;
- if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip))
+ if (xfs_bmap_bmdr_space(block) <= xfs_inode_data_fork_size(ip))
*flags |= XFS_ILOG_DBROOT;
else {
cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
@@ -1160,7 +1160,7 @@ xfs_iread_bmbt_block(
}
/* Copy records into the incore cache. */
- frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ frp = xfs_bmbt_rec_addr(mp, block, 1);
for (j = 0; j < num_recs; j++, frp++, ir->loaded++) {
struct xfs_bmbt_irec new;
xfs_failaddr_t fa;
@@ -3112,6 +3112,23 @@ xfs_bmap_extsize_align(
return 0;
}
+static inline bool
+xfs_bmap_adjacent_valid(
+ struct xfs_bmalloca *ap,
+ xfs_fsblock_t x,
+ xfs_fsblock_t y)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+
+ if (XFS_IS_REALTIME_INODE(ap->ip) &&
+ (ap->datatype & XFS_ALLOC_USERDATA))
+ return x < mp->m_sb.sb_rblocks;
+
+ return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) &&
+ XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount &&
+ XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks;
+}
+
#define XFS_ALLOC_GAP_UNITS 4
/* returns true if ap->blkno was modified */
@@ -3119,36 +3136,25 @@ bool
xfs_bmap_adjacent(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
- xfs_fsblock_t adjust; /* adjustment to block numbers */
- xfs_mount_t *mp; /* mount point structure */
- int rt; /* true if inode is realtime */
-
-#define ISVALID(x,y) \
- (rt ? \
- (x) < mp->m_sb.sb_rblocks : \
- XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
- XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
- XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
-
- mp = ap->ip->i_mount;
- rt = XFS_IS_REALTIME_INODE(ap->ip) &&
- (ap->datatype & XFS_ALLOC_USERDATA);
+ xfs_fsblock_t adjust; /* adjustment to block numbers */
+
/*
* If allocating at eof, and there's a previous real block,
* try to use its last block as our starting point.
*/
if (ap->eof && ap->prev.br_startoff != NULLFILEOFF &&
!isnullstartblock(ap->prev.br_startblock) &&
- ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount,
- ap->prev.br_startblock)) {
+ xfs_bmap_adjacent_valid(ap,
+ ap->prev.br_startblock + ap->prev.br_blockcount,
+ ap->prev.br_startblock)) {
ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount;
/*
* Adjust for the gap between prevp and us.
*/
adjust = ap->offset -
(ap->prev.br_startoff + ap->prev.br_blockcount);
- if (adjust &&
- ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
+ if (adjust && xfs_bmap_adjacent_valid(ap, ap->blkno + adjust,
+ ap->prev.br_startblock))
ap->blkno += adjust;
return true;
}
@@ -3171,7 +3177,8 @@ xfs_bmap_adjacent(
!isnullstartblock(ap->prev.br_startblock) &&
(prevbno = ap->prev.br_startblock +
ap->prev.br_blockcount) &&
- ISVALID(prevbno, ap->prev.br_startblock)) {
+ xfs_bmap_adjacent_valid(ap, prevbno,
+ ap->prev.br_startblock)) {
/*
* Calculate gap to end of previous block.
*/
@@ -3187,8 +3194,8 @@ xfs_bmap_adjacent(
* number, then just use the end of the previous block.
*/
if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
- ISVALID(prevbno + prevdiff,
- ap->prev.br_startblock))
+ xfs_bmap_adjacent_valid(ap, prevbno + prevdiff,
+ ap->prev.br_startblock))
prevbno += adjust;
else
prevdiff += adjust;
@@ -3220,9 +3227,11 @@ xfs_bmap_adjacent(
* offset by our length.
*/
if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length &&
- ISVALID(gotbno - gotdiff, gotbno))
+ xfs_bmap_adjacent_valid(ap, gotbno - gotdiff,
+ gotbno))
gotbno -= adjust;
- else if (ISVALID(gotbno - ap->length, gotbno)) {
+ else if (xfs_bmap_adjacent_valid(ap, gotbno - ap->length,
+ gotbno)) {
gotbno -= ap->length;
gotdiff += adjust - ap->length;
} else
@@ -3250,7 +3259,7 @@ xfs_bmap_adjacent(
return true;
}
}
-#undef ISVALID
+
return false;
}
@@ -4847,6 +4856,7 @@ xfs_bmapi_remap(
}
ip->i_nblocks += len;
+ ip->i_delayed_blks -= len; /* see xfs_bmap_defer_add */
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (ifp->if_format == XFS_DINODE_FMT_BTREE)
@@ -5376,7 +5386,8 @@ xfs_bmap_del_extent_real(
*/
if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) {
tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED;
- xfs_rtbitmap_lock(tp, mp);
+ xfs_rtbitmap_lock(mp);
+ xfs_rtbitmap_trans_join(tp);
}
error = xfs_rtfree_blocks(tp, del->br_startblock,
del->br_blockcount);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d1b06ccde19e..3464be771f95 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -65,10 +65,10 @@ xfs_bmdr_to_bmbt(
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
- fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
- tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
- fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
- tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ fkp = xfs_bmdr_key_addr(dblock, 1);
+ tkp = xfs_bmbt_key_addr(mp, rblock, 1);
+ fpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr);
+ tpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
dmxr = be16_to_cpu(dblock->bb_numrecs);
memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
@@ -168,10 +168,10 @@ xfs_bmbt_to_bmdr(
dblock->bb_level = rblock->bb_level;
dblock->bb_numrecs = rblock->bb_numrecs;
dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
- fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
- tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
- fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
- tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ fkp = xfs_bmbt_key_addr(mp, rblock, 1);
+ tkp = xfs_bmdr_key_addr(dblock, 1);
+ fpp = xfs_bmap_broot_ptr_addr(mp, rblock, 1, rblocklen);
+ tpp = xfs_bmdr_ptr_addr(dblock, 1, dmxr);
dmxr = be16_to_cpu(dblock->bb_numrecs);
memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
@@ -645,13 +645,13 @@ xfs_bmbt_commit_staged_btree(
/*
* Calculate number of records in a bmap btree block.
*/
-int
+unsigned int
xfs_bmbt_maxrecs(
struct xfs_mount *mp,
- int blocklen,
- int leaf)
+ unsigned int blocklen,
+ bool leaf)
{
- blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+ blocklen -= xfs_bmbt_block_len(mp);
return xfs_bmbt_block_maxrecs(blocklen, leaf);
}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index de1b73f1225c..49a3bae3f6ec 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -14,70 +14,6 @@ struct xfs_trans;
struct xbtree_ifakeroot;
/*
- * Btree block header size depends on a superblock flag.
- */
-#define XFS_BMBT_BLOCK_LEN(mp) \
- (xfs_has_crc(((mp))) ? \
- XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
-
-#define XFS_BMBT_REC_ADDR(mp, block, index) \
- ((xfs_bmbt_rec_t *) \
- ((char *)(block) + \
- XFS_BMBT_BLOCK_LEN(mp) + \
- ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
-
-#define XFS_BMBT_KEY_ADDR(mp, block, index) \
- ((xfs_bmbt_key_t *) \
- ((char *)(block) + \
- XFS_BMBT_BLOCK_LEN(mp) + \
- ((index) - 1) * sizeof(xfs_bmbt_key_t)))
-
-#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
- ((xfs_bmbt_ptr_t *) \
- ((char *)(block) + \
- XFS_BMBT_BLOCK_LEN(mp) + \
- (maxrecs) * sizeof(xfs_bmbt_key_t) + \
- ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
-
-#define XFS_BMDR_REC_ADDR(block, index) \
- ((xfs_bmdr_rec_t *) \
- ((char *)(block) + \
- sizeof(struct xfs_bmdr_block) + \
- ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
-
-#define XFS_BMDR_KEY_ADDR(block, index) \
- ((xfs_bmdr_key_t *) \
- ((char *)(block) + \
- sizeof(struct xfs_bmdr_block) + \
- ((index) - 1) * sizeof(xfs_bmdr_key_t)))
-
-#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
- ((xfs_bmdr_ptr_t *) \
- ((char *)(block) + \
- sizeof(struct xfs_bmdr_block) + \
- (maxrecs) * sizeof(xfs_bmdr_key_t) + \
- ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
-
-/*
- * These are to be used when we know the size of the block and
- * we don't have a cursor.
- */
-#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
- XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
-
-#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
- (int)(XFS_BMBT_BLOCK_LEN(mp) + \
- ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-
-#define XFS_BMAP_BROOT_SPACE(mp, bb) \
- (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
-#define XFS_BMDR_SPACE_CALC(nrecs) \
- (int)(sizeof(xfs_bmdr_block_t) + \
- ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
-#define XFS_BMAP_BMDR_SPACE(bb) \
- (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
-
-/*
* Maximum number of bmap btree levels.
*/
#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
@@ -99,7 +35,8 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
-extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+unsigned int xfs_bmbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, xfs_ino_t new_owner,
@@ -121,4 +58,144 @@ void xfs_bmbt_destroy_cur_cache(void);
void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf,
struct xfs_buf *bp, __u16 level, __u16 numrecs);
+/*
+ * Btree block header size depends on a superblock flag.
+ */
+static inline size_t
+xfs_bmbt_block_len(struct xfs_mount *mp)
+{
+ return xfs_has_crc(mp) ?
+ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN;
+}
+
+/* Addresses of key, pointers, and records within an incore bmbt block. */
+
+static inline struct xfs_bmbt_rec *
+xfs_bmbt_rec_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_bmbt_rec *)
+ ((char *)block + xfs_bmbt_block_len(mp) +
+ (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmbt_key_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ unsigned int index)
+{
+ return (struct xfs_bmbt_key *)
+ ((char *)block + xfs_bmbt_block_len(mp) +
+ (index - 1) * sizeof(struct xfs_bmbt_key));

+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmbt_ptr_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_bmbt_ptr_t *)
+ ((char *)block + xfs_bmbt_block_len(mp) +
+ maxrecs * sizeof(struct xfs_bmbt_key) +
+ (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/* Addresses of key, pointers, and records within an ondisk bmbt block. */
+
+static inline struct xfs_bmbt_rec *
+xfs_bmdr_rec_addr(
+ struct xfs_bmdr_block *block,
+ unsigned int index)
+{
+ return (struct xfs_bmbt_rec *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_bmbt_rec));
+}
+
+static inline struct xfs_bmbt_key *
+xfs_bmdr_key_addr(
+ struct xfs_bmdr_block *block,
+ unsigned int index)
+{
+ return (struct xfs_bmbt_key *)
+ ((char *)(block + 1) +
+ (index - 1) * sizeof(struct xfs_bmbt_key));
+}
+
+static inline xfs_bmbt_ptr_t *
+xfs_bmdr_ptr_addr(
+ struct xfs_bmdr_block *block,
+ unsigned int index,
+ unsigned int maxrecs)
+{
+ return (xfs_bmbt_ptr_t *)
+ ((char *)(block + 1) +
+ maxrecs * sizeof(struct xfs_bmbt_key) +
+ (index - 1) * sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Address of pointers within the incore btree root.
+ *
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+static inline xfs_bmbt_ptr_t *
+xfs_bmap_broot_ptr_addr(
+ struct xfs_mount *mp,
+ struct xfs_btree_block *bb,
+ unsigned int i,
+ unsigned int sz)
+{
+ return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, false));
+}
+
+/*
+ * Compute the space required for the incore btree root containing the given
+ * number of records.
+ */
+static inline size_t
+xfs_bmap_broot_space_calc(
+ struct xfs_mount *mp,
+ unsigned int nrecs)
+{
+ return xfs_bmbt_block_len(mp) +
+ (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the incore btree root given the ondisk
+ * btree root block.
+ */
+static inline size_t
+xfs_bmap_broot_space(
+ struct xfs_mount *mp,
+ struct xfs_bmdr_block *bb)
+{
+ return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs));
+}
+
+/* Compute the space required for the ondisk root block. */
+static inline size_t
+xfs_bmdr_space_calc(unsigned int nrecs)
+{
+ return sizeof(struct xfs_bmdr_block) +
+ (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t)));
+}
+
+/*
+ * Compute the space required for the ondisk root block given an incore root
+ * block.
+ */
+static inline size_t
+xfs_bmap_bmdr_space(struct xfs_btree_block *bb)
+{
+ return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs));
+}
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 40021849b42f..2cd212ad2c1d 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -28,7 +28,6 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
-#include "xfs_trans_priv.h"
#include "xfs_exchmaps.h"
static struct kmem_cache *xfs_defer_pending_cache;
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 454b63ef7201..860284064c5a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -8,6 +8,7 @@
/*
* SGI's XFS filesystem's major stuff (constants, structures)
+ * NOTE: This file must be compile-able with C++ compilers.
*/
/*
@@ -826,6 +827,30 @@ struct xfs_exchange_range {
};
/*
+ * Using the same definition of file2 as struct xfs_exchange_range, commit the
+ * contents of file1 into file2 if file2 has the same inode number, mtime, and
+ * ctime as the arguments provided to the call. The old contents of file2 will
+ * be moved to file1.
+ *
+ * Returns -EBUSY if there isn't an exact match for the file2 fields.
+ *
+ * Filesystems must be able to restart and complete the operation even after
+ * the system goes down.
+ */
+struct xfs_commit_range {
+ __s32 file1_fd;
+ __u32 pad; /* must be zeroes */
+ __u64 file1_offset; /* file1 offset, bytes */
+ __u64 file2_offset; /* file2 offset, bytes */
+ __u64 length; /* bytes to exchange */
+
+ __u64 flags; /* see XFS_EXCHANGE_RANGE_* below */
+
+ /* opaque file2 metadata for freshness checks */
+ __u64 file2_freshness[6];
+};
+
+/*
* Exchange file data all the way to the ends of both files, and then exchange
* the file sizes. This flag can be used to replace a file's contents with a
* different amount of data. length will be ignored.
@@ -906,13 +931,13 @@ static inline struct xfs_getparents_rec *
xfs_getparents_next_rec(struct xfs_getparents *gp,
struct xfs_getparents_rec *gpr)
{
- void *next = ((void *)gpr + gpr->gpr_reclen);
+ void *next = ((char *)gpr + gpr->gpr_reclen);
void *end = (void *)(uintptr_t)(gp->gp_buffer + gp->gp_bufsize);
if (next >= end)
return NULL;
- return next;
+ return (struct xfs_getparents_rec *)next;
}
/* Iterate through this file handle's directory parent pointers. */
@@ -997,6 +1022,8 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range)
+#define XFS_IOC_START_COMMIT _IOR ('X', 130, struct xfs_commit_range)
+#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
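A hedged userspace sketch of the flow the comment above describes: sample file2's freshness with XFS_IOC_START_COMMIT, stage the replacement data in file1, then swap it in with XFS_IOC_COMMIT_RANGE. The helper name is hypothetical; the ioctl numbers and struct xfs_commit_range come from this header.

	#include <sys/ioctl.h>
	#include <string.h>
	#include <stdio.h>
	#include <linux/types.h>
	#include "xfs_fs.h"	/* XFS_IOC_START_COMMIT et al., hypothetical include path */

	int commit_staged_copy(int staged_fd, int target_fd, __u64 length)
	{
		struct xfs_commit_range cr;

		memset(&cr, 0, sizeof(cr));
		/* snapshot file2's identity/mtime/ctime into cr.file2_freshness */
		if (ioctl(target_fd, XFS_IOC_START_COMMIT, &cr) < 0)
			return -1;

		/* ... write the new contents into staged_fd (file1) here ... */

		cr.file1_fd = staged_fd;
		cr.file1_offset = 0;
		cr.file2_offset = 0;
		cr.length = length;
		/* fails with EBUSY if file2 was modified since START_COMMIT */
		if (ioctl(target_fd, XFS_IOC_COMMIT_RANGE, &cr) < 0) {
			perror("XFS_IOC_COMMIT_RANGE");
			return -1;
		}
		return 0;
	}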
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 0af5b7a33d05..20bb5ce38134 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1855,11 +1855,12 @@ out_release:
int
xfs_dialloc(
struct xfs_trans **tpp,
- xfs_ino_t parent,
- umode_t mode,
+ const struct xfs_icreate_args *args,
xfs_ino_t *new_ino)
{
struct xfs_mount *mp = (*tpp)->t_mountp;
+ xfs_ino_t parent = args->pip ? args->pip->i_ino : 0;
+ umode_t mode = args->mode & S_IFMT;
xfs_agnumber_t agno;
int error = 0;
xfs_agnumber_t start_agno;
@@ -2947,8 +2948,8 @@ xfs_ialloc_setup_geometry(
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
- igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
- igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
+ igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index b549627e3a61..3a1323155a45 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -33,11 +33,13 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
}
+struct xfs_icreate_args;
+
/*
* Allocate an inode on disk. Mode is used to tell whether the new inode will
* need space, and whether it is a directory.
*/
-int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode,
+int xfs_dialloc(struct xfs_trans **tpp, const struct xfs_icreate_args *args,
xfs_ino_t *new_ino);
int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 496e2f72a85b..401b42d52af6 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -572,11 +572,11 @@ xfs_inobt_block_maxrecs(
/*
* Calculate number of records in an inobt btree block.
*/
-int
+unsigned int
xfs_inobt_maxrecs(
struct xfs_mount *mp,
- int blocklen,
- int leaf)
+ unsigned int blocklen,
+ bool leaf)
{
blocklen -= XFS_INOBT_BLOCK_LEN(mp);
return xfs_inobt_block_maxrecs(blocklen, leaf);
@@ -749,7 +749,7 @@ xfs_finobt_count_blocks(
if (error)
return error;
- cur = xfs_inobt_init_cursor(pag, tp, agbp);
+ cur = xfs_finobt_init_cursor(pag, tp, agbp);
error = xfs_btree_count_blocks(cur, tree_blocks);
xfs_btree_del_cursor(cur, error);
xfs_trans_brelse(tp, agbp);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 6472ec1ecbb4..300edf5bc009 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -50,7 +50,8 @@ struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
struct xfs_trans *tp, struct xfs_buf *agbp);
struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag,
struct xfs_trans *tp, struct xfs_buf *agbp);
-extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+unsigned int xfs_inobt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
/* ir_holemask to inode allocation bitmap conversion */
uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 513b50da6215..79babeac9d75 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -514,12 +514,18 @@ xfs_dinode_verify(
return __this_address;
}
- if (dip->di_version > 1) {
+ /*
+ * Historical note: xfsprogs in the 3.2 era set up its incore inodes to
+ * have di_nlink track the link count, even if the actual filesystem
+ * only supported V1 inodes (i.e. di_onlink). When writing out the
+ * ondisk inode, it would set both the ondisk di_nlink and di_onlink to
+ * the incore di_nlink value, which is why we cannot check for
+ * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with
+ * di_onlink==0, so we can check that.
+ */
+ if (dip->di_version >= 2) {
if (dip->di_onlink)
return __this_address;
- } else {
- if (dip->di_nlink)
- return __this_address;
}
/* don't allow invalid i_size */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9d11ae015909..1158ca48626b 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -185,7 +185,7 @@ xfs_iformat_btree(
ifp = xfs_ifork_ptr(ip, whichfork);
dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
- size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+ size = xfs_bmap_broot_space(mp, dfp);
nrecs = be16_to_cpu(dfp->bb_numrecs);
level = be16_to_cpu(dfp->bb_level);
@@ -198,7 +198,7 @@ xfs_iformat_btree(
*/
if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) ||
nrecs == 0 ||
- XFS_BMDR_SPACE_CALC(nrecs) >
+ xfs_bmdr_space_calc(nrecs) >
XFS_DFORK_SIZE(dip, mp, whichfork) ||
ifp->if_nextents > ip->i_nblocks) ||
level == 0 || level > XFS_BM_MAXLEVELS(mp, whichfork)) {
@@ -409,7 +409,7 @@ xfs_iroot_realloc(
* allocate it now and get out.
*/
if (ifp->if_broot_bytes == 0) {
- new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+ new_size = xfs_bmap_broot_space_calc(mp, rec_diff);
ifp->if_broot = kmalloc(new_size,
GFP_KERNEL | __GFP_NOFAIL);
ifp->if_broot_bytes = (int)new_size;
@@ -422,17 +422,17 @@ xfs_iroot_realloc(
* location. The records don't change location because
* they are kept butted up against the btree block header.
*/
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false);
new_max = cur_max + rec_diff;
- new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ new_size = xfs_bmap_broot_space_calc(mp, new_max);
ifp->if_broot = krealloc(ifp->if_broot, new_size,
GFP_KERNEL | __GFP_NOFAIL);
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
(int)new_size);
ifp->if_broot_bytes = (int)new_size;
- ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
xfs_inode_fork_size(ip, whichfork));
memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
return;
@@ -444,11 +444,11 @@ xfs_iroot_realloc(
* records, just get rid of the root and clear the status bit.
*/
ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
- cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false);
new_max = cur_max + rec_diff;
ASSERT(new_max >= 0);
if (new_max > 0)
- new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+ new_size = xfs_bmap_broot_space_calc(mp, new_max);
else
new_size = 0;
if (new_size > 0) {
@@ -457,28 +457,28 @@ xfs_iroot_realloc(
* First copy over the btree block header.
*/
memcpy(new_broot, ifp->if_broot,
- XFS_BMBT_BLOCK_LEN(ip->i_mount));
+ xfs_bmbt_block_len(ip->i_mount));
} else {
new_broot = NULL;
}
/*
- * Only copy the records and pointers if there are any.
+ * Only copy the keys and pointers if there are any.
*/
if (new_max > 0) {
/*
- * First copy the records.
+ * First copy the keys.
*/
- op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
- np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
- memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+ op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1);
+ np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1);
+ memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t));
/*
* Then copy the pointers.
*/
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+ np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1,
(int)new_size);
memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
}
@@ -486,7 +486,7 @@ xfs_iroot_realloc(
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
if (ifp->if_broot)
- ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
xfs_inode_fork_size(ip, whichfork));
return;
}
@@ -655,7 +655,7 @@ xfs_iflush_fork(
if ((iip->ili_fields & brootflag[whichfork]) &&
(ifp->if_broot_bytes > 0)) {
ASSERT(ifp->if_broot != NULL);
- ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+ ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <=
xfs_inode_fork_size(ip, whichfork));
xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 032333289113..cc38e1c3c3e1 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -308,7 +308,7 @@ xfs_inode_init(
!vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
inode->i_mode &= ~S_ISGID;
- ip->i_projid = pip ? xfs_get_initial_prid(pip) : 0;
+ ip->i_projid = xfs_get_initial_prid(pip);
}
ip->i_disk_size = 0;
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index cb3b1d42ae9a..795928d1a66d 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -417,9 +417,10 @@ xfs_refcountbt_block_maxrecs(
/*
* Calculate the number of records in a refcount btree block.
*/
-int
+unsigned int
xfs_refcountbt_maxrecs(
- int blocklen,
+ struct xfs_mount *mp,
+ unsigned int blocklen,
bool leaf)
{
blocklen -= XFS_REFCOUNT_BLOCK_LEN;
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index 1e0ab25f6c68..beb93bef6a81 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -48,7 +48,8 @@ struct xbtree_afakeroot;
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
struct xfs_perag *pag);
-extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
+unsigned int xfs_refcountbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 56fd6c4bd8b4..ac2f1f499b76 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -731,10 +731,11 @@ xfs_rmapbt_block_maxrecs(
/*
* Calculate number of records in an rmap btree block.
*/
-int
+unsigned int
xfs_rmapbt_maxrecs(
- int blocklen,
- int leaf)
+ struct xfs_mount *mp,
+ unsigned int blocklen,
+ bool leaf)
{
blocklen -= XFS_RMAP_BLOCK_LEN;
return xfs_rmapbt_block_maxrecs(blocklen, leaf);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index eb90d89e8086..119b1567cd0e 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -47,7 +47,8 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_perag *pag);
void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
struct xfs_trans *tp, struct xfs_buf *agbp);
-int xfs_rmapbt_maxrecs(int blocklen, int leaf);
+unsigned int xfs_rmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+ bool leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 386b672c5058..27a4472402ba 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -13,6 +13,8 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
@@ -69,7 +71,7 @@ xfs_rtbuf_cache_relse(
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
-int
+static int
xfs_rtbuf_get(
struct xfs_rtalloc_args *args,
xfs_fileoff_t block, /* block number in bitmap or summary */
@@ -138,15 +140,43 @@ xfs_rtbuf_get(
return 0;
}
+int
+xfs_rtbitmap_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ struct xfs_mount *mp = args->mp;
+
+ if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) {
+ xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_rtbuf_get(args, block, 0);
+}
+
+int
+xfs_rtsummary_read_buf(
+ struct xfs_rtalloc_args *args,
+ xfs_fileoff_t block)
+{
+ struct xfs_mount *mp = args->mp;
+
+ if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) {
+ xfs_rt_mark_sick(args->mp, XFS_SICK_RT_SUMMARY);
+ return -EFSCORRUPTED;
+ }
+ return xfs_rtbuf_get(args, block, 1);
+}
+
/*
- * Searching backward from start to limit, find the first block whose
- * allocated/free state is different from start's.
+ * Searching backward from start, find the first block whose allocated/free state
+ * is different from start's.
*/
int
xfs_rtfind_back(
struct xfs_rtalloc_args *args,
xfs_rtxnum_t start, /* starting rtext to look at */
- xfs_rtxnum_t limit, /* last rtext to look at */
xfs_rtxnum_t *rtx) /* out: start rtext found */
{
struct xfs_mount *mp = args->mp;
@@ -175,7 +205,7 @@ xfs_rtfind_back(
*/
word = xfs_rtx_to_rbmword(mp, start);
bit = (int)(start & (XFS_NBWORD - 1));
- len = start - limit + 1;
+ len = start + 1;
/*
* Compute match value, based on the bit at start: if 1 (free)
* then all-ones, else all-zeroes.
@@ -316,6 +346,8 @@ xfs_rtfind_forw(
xfs_rtword_t incore;
unsigned int word; /* word number in the buffer */
+ ASSERT(start <= limit);
+
/*
* Compute and read in starting bitmap block for starting block.
*/
@@ -698,7 +730,7 @@ xfs_rtfree_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(args, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, &preblock);
if (error) {
return error;
}
@@ -990,25 +1022,24 @@ xfs_rtfree_blocks(
xfs_filblks_t rtlen)
{
struct xfs_mount *mp = tp->t_mountp;
- xfs_rtxnum_t start;
- xfs_filblks_t len;
xfs_extlen_t mod;
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
- len = xfs_rtb_to_rtxrem(mp, rtlen, &mod);
+ mod = xfs_rtb_to_rtxoff(mp, rtlen);
if (mod) {
ASSERT(mod == 0);
return -EIO;
}
- start = xfs_rtb_to_rtxrem(mp, rtbno, &mod);
+ mod = xfs_rtb_to_rtxoff(mp, rtbno);
if (mod) {
ASSERT(mod == 0);
return -EIO;
}
- return xfs_rtfree_extent(tp, start, len);
+ return xfs_rtfree_extent(tp, xfs_rtb_to_rtx(mp, rtbno),
+ xfs_rtb_to_rtx(mp, rtlen));
}
/* Find all the free records within a given range. */
@@ -1016,8 +1047,8 @@ int
xfs_rtalloc_query_range(
struct xfs_mount *mp,
struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *low_rec,
- const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtxnum_t start,
+ xfs_rtxnum_t end,
xfs_rtalloc_query_range_fn fn,
void *priv)
{
@@ -1025,45 +1056,42 @@ xfs_rtalloc_query_range(
.mp = mp,
.tp = tp,
};
- struct xfs_rtalloc_rec rec;
- xfs_rtxnum_t rtstart;
- xfs_rtxnum_t rtend;
- xfs_rtxnum_t high_key;
- int is_free;
int error = 0;
- if (low_rec->ar_startext > high_rec->ar_startext)
+ if (start > end)
return -EINVAL;
- if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
- low_rec->ar_startext == high_rec->ar_startext)
+ if (start == end || start >= mp->m_sb.sb_rextents)
return 0;
- high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1);
+ end = min(end, mp->m_sb.sb_rextents - 1);
/* Iterate the bitmap, looking for discrepancies. */
- rtstart = low_rec->ar_startext;
- while (rtstart <= high_key) {
+ while (start <= end) {
+ struct xfs_rtalloc_rec rec;
+ int is_free;
+ xfs_rtxnum_t rtend;
+
/* Is the first block free? */
- error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend,
+ error = xfs_rtcheck_range(&args, start, 1, 1, &rtend,
&is_free);
if (error)
break;
/* How long does the extent go for? */
- error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend);
+ error = xfs_rtfind_forw(&args, start, end, &rtend);
if (error)
break;
if (is_free) {
- rec.ar_startext = rtstart;
- rec.ar_extcount = rtend - rtstart + 1;
+ rec.ar_startext = start;
+ rec.ar_extcount = rtend - start + 1;
error = fn(mp, tp, &rec, priv);
if (error)
break;
}
- rtstart = rtend + 1;
+ start = rtend + 1;
}
xfs_rtbuf_cache_relse(&args);
@@ -1078,13 +1106,8 @@ xfs_rtalloc_query_all(
xfs_rtalloc_query_range_fn fn,
void *priv)
{
- struct xfs_rtalloc_rec keys[2];
-
- keys[0].ar_startext = 0;
- keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
- keys[0].ar_extcount = keys[1].ar_extcount = 0;
-
- return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
+ return xfs_rtalloc_query_range(mp, tp, 0, mp->m_sb.sb_rextents - 1, fn,
+ priv);
}
/* Is the given extent all free? */
@@ -1125,21 +1148,6 @@ xfs_rtbitmap_blockcount(
return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
}
-/*
- * Compute the number of rtbitmap words needed to populate every block of a
- * bitmap that is large enough to track the given number of rt extents.
- */
-unsigned long long
-xfs_rtbitmap_wordcount(
- struct xfs_mount *mp,
- xfs_rtbxlen_t rtextents)
-{
- xfs_filblks_t blocks;
-
- blocks = xfs_rtbitmap_blockcount(mp, rtextents);
- return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
-}
-
/* Compute the number of rtsummary blocks needed to track the given rt space. */
xfs_filblks_t
xfs_rtsummary_blockcount(
@@ -1153,39 +1161,25 @@ xfs_rtsummary_blockcount(
return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
}
-/*
- * Compute the number of rtsummary info words needed to populate every block of
- * a summary file that is large enough to track the given rt space.
- */
-unsigned long long
-xfs_rtsummary_wordcount(
- struct xfs_mount *mp,
- unsigned int rsumlevels,
- xfs_extlen_t rbmblocks)
+/* Lock both realtime free space metadata inodes for a freespace update. */
+void
+xfs_rtbitmap_lock(
+ struct xfs_mount *mp)
{
- xfs_filblks_t blocks;
-
- blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks);
- return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG;
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
}
/*
- * Lock both realtime free space metadata inodes for a freespace update. If a
- * transaction is given, the inodes will be joined to the transaction and the
+ * Join both realtime free space metadata inodes to the transaction. The
* ILOCKs will be released on transaction commit.
*/
void
-xfs_rtbitmap_lock(
- struct xfs_trans *tp,
- struct xfs_mount *mp)
+xfs_rtbitmap_trans_join(
+ struct xfs_trans *tp)
{
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
- if (tp)
- xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
- if (tp)
- xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tp->t_mountp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tp->t_mountp->m_rsumip, XFS_ILOCK_EXCL);
}
/* Unlock both realtime free space metadata inodes after a freespace update. */
@@ -1225,3 +1219,127 @@ xfs_rtbitmap_unlock_shared(
if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
}
+
+static int
+xfs_rtfile_alloc_blocks(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb,
+ struct xfs_bmbt_irec *map)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int nmap = 1;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc,
+ XFS_GROWFSRT_SPACE_RES(mp, count_fsb), 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_METADATA, 0, map, &nmap);
+ if (error)
+ goto out_trans_cancel;
+
+ return xfs_trans_commit(tp);
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/* Get a buffer for the block. */
+static int
+xfs_rtfile_initialize_block(
+ struct xfs_inode *ip,
+ xfs_fsblock_t fsbno,
+ void *data)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_buf *bp;
+ const size_t copylen = mp->m_blockwsize << XFS_WORDLOG;
+ enum xfs_blft buf_type;
+ int error;
+
+ if (ip == mp->m_rsumip)
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ else
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ xfs_trans_buf_set_type(tp, bp, buf_type);
+ bp->b_ops = &xfs_rtbuf_ops;
+ if (data)
+ memcpy(bp->b_addr, data, copylen);
+ else
+ memset(bp->b_addr, 0, copylen);
+ xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ * @data must be a contiguous buffer large enough to fill all blocks in the
+ * file, or NULL to initialize the contents to zeroes.
+ */
+int
+xfs_rtfile_initialize_blocks(
+ struct xfs_inode *ip, /* inode (bitmap/summary) */
+ xfs_fileoff_t offset_fsb, /* offset to start from */
+ xfs_fileoff_t end_fsb, /* offset to allocate to */
+ void *data) /* data to fill the blocks */
+{
+ struct xfs_mount *mp = ip->i_mount;
+ const size_t copylen = mp->m_blockwsize << XFS_WORDLOG;
+
+ while (offset_fsb < end_fsb) {
+ struct xfs_bmbt_irec map;
+ xfs_filblks_t i;
+ int error;
+
+ error = xfs_rtfile_alloc_blocks(ip, offset_fsb,
+ end_fsb - offset_fsb, &map);
+ if (error)
+ return error;
+
+ /*
+ * Now we need to clear the allocated blocks.
+ *
+ * Do this one block per transaction, to keep it simple.
+ */
+ for (i = 0; i < map.br_blockcount; i++) {
+ error = xfs_rtfile_initialize_block(ip,
+ map.br_startblock + i, data);
+ if (error)
+ return error;
+ if (data)
+ data += copylen;
+ }
+
+ offset_fsb = map.br_startoff + map.br_blockcount;
+ }
+
+ return 0;
+}
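xfs_rtfile_initialize_block() copies (or zeroes) copylen = m_blockwsize << XFS_WORDLOG bytes per block, and xfs_rtfile_initialize_blocks() advances @data by that amount each iteration, so a non-NULL @data buffer must cover (end_fsb - offset_fsb) * copylen bytes. A minimal sketch of that sizing, assuming 32-bit rt bitmap words (XFS_WORDLOG == 2) and 4096-byte blocks:

	#include <stdio.h>

	#define XFS_WORDLOG	2	/* log2(sizeof(xfs_rtword_t)), an assumption */

	int main(void)
	{
		unsigned long long offset_fsb = 0, end_fsb = 8;	/* 8 new blocks */
		unsigned int blockwsize = 4096 >> XFS_WORDLOG;	/* rt words per block */
		size_t copylen = (size_t)blockwsize << XFS_WORDLOG;

		printf("bytes copied per block: %zu\n", copylen);
		printf("data buffer must cover %llu bytes\n",
		       (end_fsb - offset_fsb) * copylen);
		return 0;
	}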
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 6186585f2c37..140513d1d6bc 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -87,24 +87,6 @@ xfs_rtb_to_rtxoff(
}
/*
- * Crack an rt block number into an rt extent number and an offset within that
- * rt extent. Returns the rt extent number directly and the offset in @off.
- */
-static inline xfs_rtxnum_t
-xfs_rtb_to_rtxrem(
- struct xfs_mount *mp,
- xfs_rtblock_t rtbno,
- xfs_extlen_t *off)
-{
- if (likely(mp->m_rtxblklog >= 0)) {
- *off = rtbno & mp->m_rtxblkmask;
- return rtbno >> mp->m_rtxblklog;
- }
-
- return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off);
-}
-
-/*
* Convert an rt block number into an rt extent number, rounding up to the next
* rt extent if the rt block is not aligned to an rt extent boundary.
*/
@@ -293,30 +275,12 @@ typedef int (*xfs_rtalloc_query_range_fn)(
#ifdef CONFIG_XFS_RT
void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args);
-
-int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block,
- int issum);
-
-static inline int
-xfs_rtbitmap_read_buf(
- struct xfs_rtalloc_args *args,
- xfs_fileoff_t block)
-{
- return xfs_rtbuf_get(args, block, 0);
-}
-
-static inline int
-xfs_rtsummary_read_buf(
- struct xfs_rtalloc_args *args,
- xfs_fileoff_t block)
-{
- return xfs_rtbuf_get(args, block, 1);
-}
-
+int xfs_rtbitmap_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block);
+int xfs_rtsummary_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block);
int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat);
int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
- xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
+ xfs_rtxnum_t *rtblock);
int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
@@ -328,8 +292,7 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxlen_t len);
int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *low_rec,
- const struct xfs_rtalloc_rec *high_rec,
+ xfs_rtxnum_t start, xfs_rtxnum_t end,
xfs_rtalloc_query_range_fn fn, void *priv);
int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
@@ -353,16 +316,15 @@ int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
rtextents);
-unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
- xfs_rtbxlen_t rtextents);
-
xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
unsigned int rsumlevels, xfs_extlen_t rbmblocks);
-unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
- unsigned int rsumlevels, xfs_extlen_t rbmblocks);
-void xfs_rtbitmap_lock(struct xfs_trans *tp, struct xfs_mount *mp);
+int xfs_rtfile_initialize_blocks(struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data);
+
+void xfs_rtbitmap_lock(struct xfs_mount *mp);
void xfs_rtbitmap_unlock(struct xfs_mount *mp);
+void xfs_rtbitmap_trans_join(struct xfs_trans *tp);
/* Lock the rt bitmap inode in shared mode */
#define XFS_RBMLOCK_BITMAP (1U << 0)
@@ -388,10 +350,9 @@ xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
/* shut up gcc */
return 0;
}
-# define xfs_rtbitmap_wordcount(mp, r) (0)
# define xfs_rtsummary_blockcount(mp, l, b) (0)
-# define xfs_rtsummary_wordcount(mp, l, b) (0)
-# define xfs_rtbitmap_lock(tp, mp) do { } while (0)
+# define xfs_rtbitmap_lock(mp) do { } while (0)
+# define xfs_rtbitmap_trans_join(tp) do { } while (0)
# define xfs_rtbitmap_unlock(mp) do { } while (0)
# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0)
# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 6b56f0f6d4c1..d95409f3cba6 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -232,6 +232,38 @@ xfs_validate_sb_read(
return 0;
}
+static uint64_t
+xfs_sb_calc_rbmblocks(
+ struct xfs_sb *sbp)
+{
+ return howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize);
+}
+
+/* Validate the realtime geometry */
+bool
+xfs_validate_rt_geometry(
+ struct xfs_sb *sbp)
+{
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
+ return false;
+
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0)
+ return false;
+ return true;
+ }
+
+ if (sbp->sb_rextents == 0 ||
+ sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) ||
+ sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != xfs_sb_calc_rbmblocks(sbp))
+ return false;
+
+ return true;
+}
+
/* Check all the superblock fields we care about when writing one out. */
STATIC int
xfs_validate_sb_write(
@@ -491,39 +523,13 @@ xfs_validate_sb_common(
}
}
- /* Validate the realtime geometry; stolen from xfs_repair */
- if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
- sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ if (!xfs_validate_rt_geometry(sbp)) {
xfs_notice(mp,
- "realtime extent sanity check failed");
+ "realtime %sgeometry check failed",
+ sbp->sb_rblocks ? "" : "zeroed ");
return -EFSCORRUPTED;
}
- if (sbp->sb_rblocks == 0) {
- if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
- sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
- xfs_notice(mp,
- "realtime zeroed geometry check failed");
- return -EFSCORRUPTED;
- }
- } else {
- uint64_t rexts;
- uint64_t rbmblocks;
-
- rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
- rbmblocks = howmany_64(sbp->sb_rextents,
- NBBY * sbp->sb_blocksize);
-
- if (!xfs_validate_rtextents(rexts) ||
- sbp->sb_rextents != rexts ||
- sbp->sb_rextslog != xfs_compute_rextslog(rexts) ||
- sbp->sb_rbmblocks != rbmblocks) {
- xfs_notice(mp,
- "realtime geometry sanity check failed");
- return -EFSCORRUPTED;
- }
- }
-
/*
* Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign)
* would imply the image is corrupted.
@@ -959,6 +965,15 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.verify_write = xfs_sb_write_verify,
};
+void
+xfs_mount_sb_set_rextsize(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
+ mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
+}
+
/*
* xfs_mount_common
*
@@ -983,26 +998,25 @@ xfs_sb_mount_common(
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
- mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
- mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
+ xfs_mount_sb_set_rextsize(mp, sbp);
- mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true);
+ mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false);
mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
- mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, true);
+ mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, false);
mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
- mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1);
- mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0);
+ mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, true);
+ mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, false);
mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
- mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true);
- mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false);
+ mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true);
+ mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false);
mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
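xfs_validate_rt_geometry() above keeps the same relationships the open-coded checks enforced: rextents == rblocks / rextsize, rbmblocks == howmany(rextents, NBBY * blocksize), and rextslog == xfs_compute_rextslog(rextents). A worked example with illustrative numbers (4096-byte blocks, 256-block rt extents, 2^20 rt blocks); the highbit computation below stands in for xfs_compute_rextslog():

	#include <stdio.h>

	#define NBBY 8

	int main(void)
	{
		unsigned long long blocksize = 4096;
		unsigned long long rextsize = 256;		/* fsblocks per rt extent */
		unsigned long long rblocks = 1ULL << 20;	/* rt blocks */

		unsigned long long rextents = rblocks / rextsize;		/* 4096 */
		unsigned long long rbmblocks =
			(rextents + NBBY * blocksize - 1) / (NBBY * blocksize);	/* 1 */
		unsigned int rextslog = rextents ? 63 - __builtin_clzll(rextents) : 0;

		printf("rextents=%llu rbmblocks=%llu rextslog=%u\n",
		       rextents, rbmblocks, rextslog);
		return 0;
	}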
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 37b1ed1bc209..885c83755991 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -17,6 +17,8 @@ extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
extern int xfs_sync_sb_buf(struct xfs_mount *mp);
extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+void xfs_mount_sb_set_rextsize(struct xfs_mount *mp,
+ struct xfs_sb *sbp);
extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
@@ -38,6 +40,7 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
__s64 sunit, __s64 swidth, int sectorsize, bool may_repair,
bool silent);
+bool xfs_validate_rt_geometry(struct xfs_sb *sbp);
uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 45aaf169806a..1a7f95bcf069 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -130,7 +130,7 @@ xfs_calc_inode_res(
(4 * sizeof(struct xlog_op_header) +
sizeof(struct xfs_inode_log_format) +
mp->m_sb.sb_inodesize +
- 2 * XFS_BMBT_BLOCK_LEN(mp));
+ 2 * xfs_bmbt_block_len(mp));
}
/*
@@ -918,7 +918,7 @@ xfs_calc_growrtfree_reservation(
return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
- xfs_calc_buf_res(1, mp->m_rsumsize);
+ xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, mp->m_rsumblocks));
}
/*
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 76eb9e328835..a8cd44d03ef6 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -235,16 +235,4 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
xfs_fileoff_t len);
-/* Do we support an rt volume having this number of rtextents? */
-static inline bool
-xfs_validate_rtextents(
- xfs_rtbxlen_t rtextents)
-{
- /* No runt rt volumes */
- if (rtextents == 0)
- return false;
-
- return true;
-}
-
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 24a15bf784f1..5ab2ac53c920 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -938,7 +938,13 @@ xchk_bmap(
}
break;
case XFS_ATTR_FORK:
- if (!xfs_has_attr(mp) && !xfs_has_attr2(mp))
+ /*
+ * "attr" means that an attr fork was created at some point in
+ * the life of this filesystem. "attr2" means that inodes have
+ * variable-sized data/attr fork areas. Hence we only check
+ * attr here.
+ */
+ if (!xfs_has_attr(mp))
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
break;
default:
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 1e656fab5e41..49dc38acc66b 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -480,7 +480,7 @@ xrep_bmap_iroot_size(
{
ASSERT(level > 0);
- return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+ return xfs_bmap_broot_space_calc(cur->bc_mp, nr_this_level);
}
/* Update the inode counters. */
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 3d5f1f6b4b7b..47148cc4a833 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -53,6 +53,11 @@ int xchk_checkpoint_log(struct xfs_mount *mp);
bool xchk_should_check_xref(struct xfs_scrub *sc, int *error,
struct xfs_btree_cur **curpp);
+static inline int xchk_setup_nothing(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+
/* Setup functions */
int xchk_setup_agheader(struct xfs_scrub *sc);
int xchk_setup_fs(struct xfs_scrub *sc);
@@ -72,16 +77,8 @@ int xchk_setup_dirtree(struct xfs_scrub *sc);
int xchk_setup_rtbitmap(struct xfs_scrub *sc);
int xchk_setup_rtsummary(struct xfs_scrub *sc);
#else
-static inline int
-xchk_setup_rtbitmap(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
-static inline int
-xchk_setup_rtsummary(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_setup_rtbitmap xchk_setup_nothing
+# define xchk_setup_rtsummary xchk_setup_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -93,16 +90,8 @@ xchk_ino_dqattach(struct xfs_scrub *sc)
{
return 0;
}
-static inline int
-xchk_setup_quota(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
-static inline int
-xchk_setup_quotacheck(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_setup_quota xchk_setup_nothing
+# define xchk_setup_quotacheck xchk_setup_nothing
#endif
int xchk_setup_fscounters(struct xfs_scrub *sc);
int xchk_setup_nlinks(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index daf9f1ee7c2c..3e45b9b72312 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -846,7 +846,7 @@ xrep_dinode_bad_bmbt_fork(
nrecs = be16_to_cpu(dfp->bb_numrecs);
level = be16_to_cpu(dfp->bb_level);
- if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
+ if (nrecs == 0 || xfs_bmdr_space_calc(nrecs) > dfork_size)
return true;
if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
return true;
@@ -858,12 +858,12 @@ xrep_dinode_bad_bmbt_fork(
xfs_fileoff_t fileoff;
xfs_fsblock_t fsbno;
- fkp = XFS_BMDR_KEY_ADDR(dfp, i);
+ fkp = xfs_bmdr_key_addr(dfp, i);
fileoff = be64_to_cpu(fkp->br_startoff);
if (!xfs_verify_fileoff(sc->mp, fileoff))
return true;
- fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
+ fpp = xfs_bmdr_ptr_addr(dfp, i, dmxr);
fsbno = be64_to_cpu(*fpp);
if (!xfs_verify_fsbno(sc->mp, fsbno))
return true;
@@ -1121,7 +1121,7 @@ xrep_dinode_ensure_forkoff(
struct xfs_bmdr_block *bmdr;
struct xfs_scrub *sc = ri->sc;
xfs_extnum_t attr_extents, data_extents;
- size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
+ size_t bmdr_minsz = xfs_bmdr_space_calc(1);
unsigned int lit_sz = XFS_LITINO(sc->mp);
unsigned int afork_min, dfork_min;
@@ -1173,7 +1173,7 @@ xrep_dinode_ensure_forkoff(
case XFS_DINODE_FMT_BTREE:
/* Must have space for btree header and key/pointers. */
bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
- afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ afork_min = xfs_bmap_broot_space(sc->mp, bmdr);
break;
default:
/* We should never see any other formats. */
@@ -1223,7 +1223,7 @@ xrep_dinode_ensure_forkoff(
case XFS_DINODE_FMT_BTREE:
/* Must have space for btree header and key/pointers. */
bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
- dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ dfork_min = xfs_bmap_broot_space(sc->mp, bmdr);
break;
default:
dfork_min = 0;
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 3fee603f5244..7c7366c98338 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -63,7 +63,8 @@ xchk_setup_rtsummary(
* us to avoid pinning kernel memory for this purpose.
*/
descr = xchk_xfile_descr(sc, "realtime summary file");
- error = xfile_create(descr, mp->m_rsumsize, &sc->xfile);
+ error = xfile_create(descr, XFS_FSB_TO_B(mp, mp->m_rsumblocks),
+ &sc->xfile);
kfree(descr);
if (error)
return error;
@@ -95,16 +96,14 @@ xchk_setup_rtsummary(
* volume. Hence it is safe to compute and check the geometry values.
*/
if (mp->m_sb.sb_rblocks) {
- xfs_filblks_t rsumblocks;
int rextslog;
rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
rextslog = xfs_compute_rextslog(rts->rextents);
rts->rsumlevels = rextslog + 1;
rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents);
- rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
+ rts->rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
rts->rbmblocks);
- rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
}
return 0;
}
@@ -316,7 +315,7 @@ xchk_rtsummary(
}
/* Is m_rsumsize correct? */
- if (mp->m_rsumsize != rts->rsumsize) {
+ if (mp->m_rsumblocks != rts->rsumblocks) {
xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
goto out_rbm;
}
@@ -332,7 +331,7 @@ xchk_rtsummary(
* growfsrt expands the summary file before updating sb_rextents, so
* the file can be larger than rsumsize.
*/
- if (mp->m_rsumip->i_disk_size < rts->rsumsize) {
+ if (mp->m_rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) {
xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
goto out_rbm;
}
diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h
index e1d50304d8d4..e44b04cb6e2d 100644
--- a/fs/xfs/scrub/rtsummary.h
+++ b/fs/xfs/scrub/rtsummary.h
@@ -14,7 +14,7 @@ struct xchk_rtsummary {
uint64_t rextents;
uint64_t rbmblocks;
- uint64_t rsumsize;
+ xfs_filblks_t rsumblocks;
unsigned int rsumlevels;
unsigned int resblks;
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
index d9e971c4c79f..7deeb948cb70 100644
--- a/fs/xfs/scrub/rtsummary_repair.c
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -56,7 +56,7 @@ xrep_setup_rtsummary(
* transaction (which we cannot drop because we cannot drop the
* rtsummary ILOCK) and cannot ask for more reservation.
*/
- blocks = XFS_B_TO_FSB(mp, mp->m_rsumsize);
+ blocks = mp->m_rsumblocks;
blocks += xfs_bmbt_calc_size(mp, blocks) * 2;
if (blocks > UINT_MAX)
return -EOPNOTSUPP;
@@ -100,7 +100,6 @@ xrep_rtsummary(
{
struct xchk_rtsummary *rts = sc->buf;
struct xfs_mount *mp = sc->mp;
- xfs_filblks_t rsumblocks;
int error;
/* We require the rmapbt to rebuild anything. */
@@ -131,10 +130,9 @@ xrep_rtsummary(
}
/* Make sure we have space allocated for the entire summary file. */
- rsumblocks = XFS_B_TO_FSB(mp, rts->rsumsize);
xfs_trans_ijoin(sc->tp, sc->ip, 0);
xfs_trans_ijoin(sc->tp, sc->tempip, 0);
- error = xrep_tempfile_prealloc(sc, 0, rsumblocks);
+ error = xrep_tempfile_prealloc(sc, 0, rts->rsumblocks);
if (error)
return error;
@@ -143,11 +141,11 @@ xrep_rtsummary(
return error;
/* Copy the rtsummary file that we generated. */
- error = xrep_tempfile_copyin(sc, 0, rsumblocks,
+ error = xrep_tempfile_copyin(sc, 0, rts->rsumblocks,
xrep_rtsummary_prep_buf, rts);
if (error)
return error;
- error = xrep_tempfile_set_isize(sc, rts->rsumsize);
+ error = xrep_tempfile_set_isize(sc, XFS_FSB_TO_B(mp, rts->rsumblocks));
if (error)
return error;
@@ -168,7 +166,7 @@ xrep_rtsummary(
memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks);
mp->m_rsumlevels = rts->rsumlevels;
- mp->m_rsumsize = rts->rsumsize;
+ mp->m_rsumblocks = rts->rsumblocks;
/* Free the old rtsummary blocks if they're not in use. */
return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 1bc33f010d0e..5993fcaffb2c 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -231,6 +231,11 @@ xchk_should_terminate(
return false;
}
+static inline int xchk_nothing(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
@@ -254,31 +259,15 @@ int xchk_dirtree(struct xfs_scrub *sc);
int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
#else
-static inline int
-xchk_rtbitmap(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
-static inline int
-xchk_rtsummary(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_rtbitmap xchk_nothing
+# define xchk_rtsummary xchk_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
int xchk_quotacheck(struct xfs_scrub *sc);
#else
-static inline int
-xchk_quota(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
-static inline int
-xchk_quotacheck(struct xfs_scrub *sc)
-{
- return -ENOENT;
-}
+# define xchk_quota xchk_nothing
+# define xchk_quotacheck xchk_nothing
#endif
int xchk_fscounters(struct xfs_scrub *sc);
int xchk_nlinks(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index d390d56cd875..177f922acfaf 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -88,7 +88,7 @@ xrep_tempfile_create(
goto out_release_dquots;
/* Allocate inode, set up directory. */
- error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ error = xfs_dialloc(&tp, &args, &ino);
if (error)
goto out_trans_cancel;
error = xfs_icreate(tp, ino, &args, &sc->tempip);
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index d848222f802b..9b5d98fe1f8a 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -293,7 +293,7 @@ xfile_get_folio(
* (potentially last) reference in xfile_put_folio.
*/
if (flags & XFILE_ALLOC)
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
return folio;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e224b49b7cff..35a8c1b8b3cb 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -346,6 +346,17 @@ xfs_bmap_defer_add(
trace_xfs_bmap_defer(bi);
xfs_bmap_update_get_group(tp->t_mountp, bi);
+
+ /*
+ * Ensure the deferred mapping is pre-recorded in i_delayed_blks.
+ *
+ * Otherwise stat can report zero blocks for an inode that actually has
+ * data when the entire mapping is in the process of being overwritten
+ * using the out of place write path. This is undone in xfs_bmapi_remap
+ * after it has incremented di_nblocks for a successful operation.
+ */
+ if (bi->bi_type == XFS_BMAP_MAP)
+ bi->bi_owner->i_delayed_blks += bi->bi_bmap.br_blockcount;
xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
}
@@ -367,6 +378,9 @@ xfs_bmap_update_cancel_item(
{
struct xfs_bmap_intent *bi = bi_entry(item);
+ if (bi->bi_type == XFS_BMAP_MAP)
+ bi->bi_owner->i_delayed_blks -= bi->bi_bmap.br_blockcount;
+
xfs_bmap_update_put_group(bi);
kmem_cache_free(xfs_bmap_intent_cache, bi);
}
@@ -464,6 +478,9 @@ xfs_bui_recover_work(
bi->bi_owner = *ipp;
xfs_bmap_update_get_group(mp, bi);
+ /* see xfs_bmap_defer_add for details */
+ if (bi->bi_type == XFS_BMAP_MAP)
+ bi->bi_owner->i_delayed_blks += bi->bi_bmap.br_blockcount;
xfs_defer_add_item(dfp, &bi->bi_list);
return bi;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fe2e2c930975..053d567c9108 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -331,8 +331,7 @@ xfs_getbmap(
}
if (xfs_get_extsz_hint(ip) ||
- (ip->i_diflags &
- (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
+ (ip->i_diflags & XFS_DIFLAG_PREALLOC))
max_len = mp->m_super->s_maxbytes;
else
max_len = XFS_ISIZE(ip);
@@ -492,12 +491,12 @@ bool
xfs_can_free_eofblocks(
struct xfs_inode *ip)
{
- struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
+ bool found_blocks = false;
xfs_fileoff_t end_fsb;
xfs_fileoff_t last_fsb;
- int nimaps = 1;
- int error;
+ struct xfs_bmbt_irec imap;
+ struct xfs_iext_cursor icur;
/*
* Caller must either hold the exclusive io lock; or be inactivating
@@ -524,12 +523,11 @@ xfs_can_free_eofblocks(
return false;
/*
- * Only free real extents for inodes with persistent preallocations or
- * the append-only flag.
+ * Do not free real extents in preallocated files unless the file has
+ * delalloc blocks and we are forced to remove them.
*/
- if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
- if (ip->i_delayed_blks == 0)
- return false;
+ if ((ip->i_diflags & XFS_DIFLAG_PREALLOC) && !ip->i_delayed_blks)
+ return false;
/*
* Do not try to free post-EOF blocks if EOF is beyond the end of the
@@ -544,21 +542,13 @@ xfs_can_free_eofblocks(
return false;
/*
- * Look up the mapping for the first block past EOF. If we can't find
- * it, there's nothing to free.
+ * Check if there is a post-EOF extent to free.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_bmapi_read(ip, end_fsb, last_fsb - end_fsb, &imap, &nimaps,
- 0);
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap))
+ found_blocks = true;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (error || nimaps == 0)
- return false;
-
- /*
- * If there's a real mapping there or there are delayed allocation
- * reservations, then we have post-EOF blocks to try to free.
- */
- return imap.br_startblock != HOLESTARTBLOCK || ip->i_delayed_blks;
+ return found_blocks;
}
/*
@@ -653,6 +643,9 @@ xfs_alloc_file_space(
xfs_bmbt_irec_t imaps[1], *imapp;
int error;
+ if (xfs_is_always_cow_inode(ip))
+ return 0;
+
trace_xfs_alloc_file_space(ip);
if (xfs_is_shutdown(mp))
@@ -848,6 +841,14 @@ xfs_free_file_space(
if (len <= 0) /* if nothing being freed */
return 0;
+ /*
+ * Now that AIO and DIO have drained, we flush and (if necessary) invalidate
+ * the cached range over the first operation we are about to run.
+ */
+ error = xfs_flush_unmap_range(ip, offset, len);
+ if (error)
+ return error;
+
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
@@ -1184,7 +1185,7 @@ xfs_swap_extents_check_format(
*/
if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
if (xfs_inode_has_attr_fork(ip) &&
- XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip))
+ xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip))
return -EINVAL;
if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return -EINVAL;
@@ -1193,7 +1194,7 @@ xfs_swap_extents_check_format(
/* Reciprocal target->temp btree format checks */
if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
if (xfs_inode_has_attr_fork(tip) &&
- XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
+ xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
return -EINVAL;
if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return -EINVAL;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b1580644501f..209a389f2abc 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -210,7 +210,7 @@ struct xfs_buf {
* success the write is considered to be failed permanently and the
* iodone handler will take appropriate action.
*
- * For retry timeouts, we record the jiffie of the first failure. This
+ * For retry timeouts, we record the jiffy of the first failure. This
* means that we can change the retry timeout for buffers already under
* I/O and thus avoid getting stuck in a retry loop with a long timeout.
*
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 6f0fc7fe1f2b..d8c4a5dcca7a 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -158,8 +158,7 @@ static int
xfs_trim_gather_extents(
struct xfs_perag *pag,
struct xfs_trim_cur *tcur,
- struct xfs_busy_extents *extents,
- uint64_t *blocks_trimmed)
+ struct xfs_busy_extents *extents)
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_trans *tp;
@@ -280,7 +279,6 @@ xfs_trim_gather_extents(
xfs_extent_busy_insert_discard(pag, fbno, flen,
&extents->extent_list);
- *blocks_trimmed += flen;
next_extent:
if (tcur->by_bno)
error = xfs_btree_increment(cur, 0, &i);
@@ -327,8 +325,7 @@ xfs_trim_perag_extents(
struct xfs_perag *pag,
xfs_agblock_t start,
xfs_agblock_t end,
- xfs_extlen_t minlen,
- uint64_t *blocks_trimmed)
+ xfs_extlen_t minlen)
{
struct xfs_trim_cur tcur = {
.start = start,
@@ -354,8 +351,7 @@ xfs_trim_perag_extents(
extents->owner = extents;
INIT_LIST_HEAD(&extents->extent_list);
- error = xfs_trim_gather_extents(pag, &tcur, extents,
- blocks_trimmed);
+ error = xfs_trim_gather_extents(pag, &tcur, extents);
if (error) {
kfree(extents);
break;
@@ -389,8 +385,7 @@ xfs_trim_datadev_extents(
struct xfs_mount *mp,
xfs_daddr_t start,
xfs_daddr_t end,
- xfs_extlen_t minlen,
- uint64_t *blocks_trimmed)
+ xfs_extlen_t minlen)
{
xfs_agnumber_t start_agno, end_agno;
xfs_agblock_t start_agbno, end_agbno;
@@ -411,8 +406,7 @@ xfs_trim_datadev_extents(
if (start_agno == end_agno)
agend = end_agbno;
- error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen,
- blocks_trimmed);
+ error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
if (error)
last_error = error;
@@ -431,9 +425,6 @@ struct xfs_trim_rtdev {
/* list of rt extents to free */
struct list_head extent_list;
- /* pointer to count of blocks trimmed */
- uint64_t *blocks_trimmed;
-
/* minimum length that caller allows us to trim */
xfs_rtblock_t minlen_fsb;
@@ -551,7 +542,6 @@ xfs_trim_gather_rtextent(
busyp->length = rlen;
INIT_LIST_HEAD(&busyp->list);
list_add_tail(&busyp->list, &tr->extent_list);
- *tr->blocks_trimmed += rlen;
tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
return 0;
@@ -562,15 +552,12 @@ xfs_trim_rtdev_extents(
struct xfs_mount *mp,
xfs_daddr_t start,
xfs_daddr_t end,
- xfs_daddr_t minlen,
- uint64_t *blocks_trimmed)
+ xfs_daddr_t minlen)
{
- struct xfs_rtalloc_rec low = { };
- struct xfs_rtalloc_rec high = { };
struct xfs_trim_rtdev tr = {
- .blocks_trimmed = blocks_trimmed,
.minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
};
+ xfs_rtxnum_t low, high;
struct xfs_trans *tp;
xfs_daddr_t rtdev_daddr;
int error;
@@ -596,17 +583,17 @@ xfs_trim_rtdev_extents(
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1);
/* Convert the rt blocks to rt extents */
- low.ar_startext = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start));
- high.ar_startext = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end));
+ low = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start));
+ high = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end));
/*
* Walk the free ranges between low and high. The query_range function
* trims the extents returned.
*/
do {
- tr.stop_rtx = low.ar_startext + (mp->m_sb.sb_blocksize * NBBY);
+ tr.stop_rtx = low + (mp->m_sb.sb_blocksize * NBBY);
xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
- error = xfs_rtalloc_query_range(mp, tp, &low, &high,
+ error = xfs_rtalloc_query_range(mp, tp, low, high,
xfs_trim_gather_rtextent, &tr);
if (error == -ECANCELED)
@@ -627,14 +614,14 @@ xfs_trim_rtdev_extents(
if (error)
break;
- low.ar_startext = tr.restart_rtx;
- } while (!xfs_trim_should_stop() && low.ar_startext <= high.ar_startext);
+ low = tr.restart_rtx;
+ } while (!xfs_trim_should_stop() && low <= high);
xfs_trans_cancel(tp);
return error;
}
#else
-# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP)
+# define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */
/*
@@ -661,7 +648,6 @@ xfs_ioc_trim(
xfs_daddr_t start, end;
xfs_extlen_t minlen;
xfs_rfsblock_t max_blocks;
- uint64_t blocks_trimmed = 0;
int error, last_error = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -706,15 +692,13 @@ xfs_ioc_trim(
end = start + BTOBBT(range.len) - 1;
if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
- error = xfs_trim_datadev_extents(mp, start, end, minlen,
- &blocks_trimmed);
+ error = xfs_trim_datadev_extents(mp, start, end, minlen);
if (error)
last_error = error;
}
if (rt_bdev && !xfs_trim_should_stop()) {
- error = xfs_trim_rtdev_extents(mp, start, end, minlen,
- &blocks_trimmed);
+ error = xfs_trim_rtdev_extents(mp, start, end, minlen);
if (error)
last_error = error;
}
@@ -722,7 +706,8 @@ xfs_ioc_trim(
if (last_error)
return last_error;
- range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+ range.len = min_t(unsigned long long, range.len,
+ XFS_FSB_TO_B(mp, max_blocks) - range.start);
if (copy_to_user(urange, &range, sizeof(range)))
return -EFAULT;
return 0;
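With the change above, the length handed back to userspace is the examined range (the caller's request clamped to the data device size) rather than a count of trimmed blocks. A minimal FITRIM caller showing where that value lands; the mount point path is only an example:

	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <fcntl.h>
	#include <limits.h>
	#include <stdio.h>

	int main(int argc, char **argv)
	{
		struct fstrim_range range = { .start = 0, .len = ULLONG_MAX, .minlen = 0 };
		int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);

		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		/* range.len now reports the bytes searched, not the bytes discarded */
		printf("searched %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}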
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index c8a655c92c92..d0889190ab7f 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -72,6 +72,34 @@ xfs_exchrange_estimate(
return error;
}
+/*
+ * Check that file2's metadata agree with the snapshot that we took for the
+ * range commit request.
+ *
+ * This should be called after the filesystem has locked /all/ inode metadata
+ * against modification.
+ */
+STATIC int
+xfs_exchrange_check_freshness(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip2)
+{
+ struct inode *inode2 = VFS_I(ip2);
+ struct timespec64 ctime = inode_get_ctime(inode2);
+ struct timespec64 mtime = inode_get_mtime(inode2);
+
+ trace_xfs_exchrange_freshness(fxr, ip2);
+
+ /* Check that file2 hasn't otherwise been modified. */
+ if (fxr->file2_ino != ip2->i_ino ||
+ fxr->file2_gen != inode2->i_generation ||
+ !timespec64_equal(&fxr->file2_ctime, &ctime) ||
+ !timespec64_equal(&fxr->file2_mtime, &mtime))
+ return -EBUSY;
+
+ return 0;
+}
+
#define QRETRY_IP1 (0x1)
#define QRETRY_IP2 (0x2)
@@ -607,6 +635,12 @@ xfs_exchrange_prep(
if (error || fxr->length == 0)
return error;
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
+ error = xfs_exchrange_check_freshness(fxr, ip2);
+ if (error)
+ return error;
+ }
+
/* Attach dquots to both inodes before changing block maps. */
error = xfs_qm_dqattach(ip2);
if (error)
@@ -719,7 +753,8 @@ xfs_exchange_range(
if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
return -EXDEV;
- if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+ __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
return -EINVAL;
/* Userspace requests only honored for regular files. */
@@ -802,3 +837,109 @@ xfs_ioc_exchange_range(
fdput(file1);
return error;
}
+
+/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
+struct xfs_commit_range_fresh {
+ xfs_fsid_t fsid; /* m_fixedfsid */
+ __u64 file2_ino; /* inode number */
+ __s64 file2_mtime; /* modification time */
+ __s64 file2_ctime; /* change time */
+ __s32 file2_mtime_nsec; /* mod time, nsec */
+ __s32 file2_ctime_nsec; /* change time, nsec */
+ __u32 file2_gen; /* inode generation */
+ __u32 magic; /* zero */
+};
+#define XCR_FRESH_MAGIC 0x444F524B /* DORK */
+
+/* Set up a commitrange operation by sampling file2's write-related attrs */
+long
+xfs_ioc_start_commit(
+ struct file *file,
+ struct xfs_commit_range __user *argp)
+{
+ struct xfs_commit_range args = { };
+ struct timespec64 ts;
+ struct xfs_commit_range_fresh *kern_f;
+ struct xfs_commit_range_fresh __user *user_f;
+ struct inode *inode2 = file_inode(file);
+ struct xfs_inode *ip2 = XFS_I(inode2);
+ const unsigned int lockflags = XFS_IOLOCK_SHARED |
+ XFS_MMAPLOCK_SHARED |
+ XFS_ILOCK_SHARED;
+
+ BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
+ sizeof(args.file2_freshness));
+
+ kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+ memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ xfs_ilock(ip2, lockflags);
+ ts = inode_get_ctime(inode2);
+ kern_f->file2_ctime = ts.tv_sec;
+ kern_f->file2_ctime_nsec = ts.tv_nsec;
+ ts = inode_get_mtime(inode2);
+ kern_f->file2_mtime = ts.tv_sec;
+ kern_f->file2_mtime_nsec = ts.tv_nsec;
+ kern_f->file2_ino = ip2->i_ino;
+ kern_f->file2_gen = inode2->i_generation;
+ kern_f->magic = XCR_FRESH_MAGIC;
+ xfs_iunlock(ip2, lockflags);
+
+ user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
+ if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Exchange file1 and file2 contents if file2 has not been written since the
+ * start commit operation.
+ */
+long
+xfs_ioc_commit_range(
+ struct file *file,
+ struct xfs_commit_range __user *argp)
+{
+ struct xfs_exchrange fxr = {
+ .file2 = file,
+ };
+ struct xfs_commit_range args;
+ struct xfs_commit_range_fresh *kern_f;
+ struct xfs_inode *ip2 = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip2->i_mount;
+ struct fd file1;
+ int error;
+
+ kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+ if (kern_f->magic != XCR_FRESH_MAGIC)
+ return -EBUSY;
+ if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
+ return -EBUSY;
+
+ fxr.file1_offset = args.file1_offset;
+ fxr.file2_offset = args.file2_offset;
+ fxr.length = args.length;
+ fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
+ fxr.file2_ino = kern_f->file2_ino;
+ fxr.file2_gen = kern_f->file2_gen;
+ fxr.file2_mtime.tv_sec = kern_f->file2_mtime;
+ fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
+ fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
+ fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
+
+ file1 = fdget(args.file1_fd);
+ if (!file1.file)
+ return -EBADF;
+ fxr.file1 = file1.file;
+
+ error = xfs_exchange_range(&fxr);
+ fdput(file1);
+ return error;
+}
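
A rough sketch of the intended userspace sequence for the two new ioctls follows. The uapi definitions (struct xfs_commit_range, XFS_IOC_START_COMMIT, XFS_IOC_COMMIT_RANGE) are assumed to come from an xfs_fs.h header that is not part of this hunk; the paths, length and header location are hypothetical, and only the fields referenced by the kernel code above are touched:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs_fs.h>		/* assumed home of the commit-range uapi */

/* Stage an update in a scratch file, then swap it into the target atomically. */
int commit_update(const char *target, const char *staging)
{
        struct xfs_commit_range args;
        int fd2 = open(target, O_RDWR);
        int fd1 = open(staging, O_RDWR);

        if (fd1 < 0 || fd2 < 0)
                return -1;

        /* Step 1: snapshot file2's identity and timestamps into the opaque blob. */
        memset(&args, 0, sizeof(args));
        if (ioctl(fd2, XFS_IOC_START_COMMIT, &args) < 0)
                return -1;

        /* ... write the new contents into the staging file here ... */

        /* Step 2: exchange the ranges; fails with EBUSY if file2 changed meanwhile. */
        args.file1_fd = fd1;
        args.file1_offset = 0;
        args.file2_offset = 0;
        args.length = 65536;		/* illustrative length */
        args.flags = 0;
        if (ioctl(fd2, XFS_IOC_COMMIT_RANGE, &args) < 0)
                return -1;

        close(fd1);
        close(fd2);
        return 0;
}
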
diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h
index 039abcca546e..bc1298aba806 100644
--- a/fs/xfs/xfs_exchrange.h
+++ b/fs/xfs/xfs_exchrange.h
@@ -10,8 +10,12 @@
#define __XFS_EXCHANGE_RANGE_UPD_CMTIME1 (1ULL << 63)
#define __XFS_EXCHANGE_RANGE_UPD_CMTIME2 (1ULL << 62)
+/* Freshness check required */
+#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2 (1ULL << 61)
+
#define XFS_EXCHANGE_RANGE_PRIV_FLAGS (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
- __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+ __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \
+ __XFS_EXCHANGE_RANGE_CHECK_FRESH2)
struct xfs_exchrange {
struct file *file1;
@@ -22,10 +26,20 @@ struct xfs_exchrange {
u64 length;
u64 flags; /* XFS_EXCHANGE_RANGE flags */
+
+ /* file2 metadata for freshness checks */
+ u64 file2_ino;
+ struct timespec64 file2_mtime;
+ struct timespec64 file2_ctime;
+ u32 file2_gen;
};
long xfs_ioc_exchange_range(struct file *file,
struct xfs_exchange_range __user *argp);
+long xfs_ioc_start_commit(struct file *file,
+ struct xfs_commit_range __user *argp);
+long xfs_ioc_commit_range(struct file *file,
+ struct xfs_commit_range __user *argp);
struct xfs_exchmaps_req;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4cdc54dc9686..e97d789495a5 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -852,6 +852,192 @@ static inline bool xfs_file_sync_writes(struct file *filp)
return false;
}
+static int
+xfs_falloc_newsize(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ loff_t *new_size)
+{
+ struct inode *inode = file_inode(file);
+
+ if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode))
+ return 0;
+ *new_size = offset + len;
+ return inode_newsize_ok(inode, *new_size);
+}
+
+static int
+xfs_falloc_setsize(
+ struct file *file,
+ loff_t new_size)
+{
+ struct iattr iattr = {
+ .ia_valid = ATTR_SIZE,
+ .ia_size = new_size,
+ };
+
+ if (!new_size)
+ return 0;
+ return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file),
+ &iattr);
+}
+
+static int
+xfs_falloc_collapse_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = i_size_read(inode) - len;
+ int error;
+
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
+
+ /*
+	 * A collapse range that overlaps or reaches EOF makes no sense; that
+	 * case is effectively a truncate operation and is rejected.
+ */
+ if (offset + len >= i_size_read(inode))
+ return -EINVAL;
+
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_insert_range(
+ struct file *file,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t isize = i_size_read(inode);
+ int error;
+
+ if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len))
+ return -EINVAL;
+
+ /*
+ * New inode size must not exceed ->s_maxbytes, accounting for
+ * possible signed overflow.
+ */
+ if (inode->i_sb->s_maxbytes - isize < len)
+ return -EFBIG;
+
+ /* Offset should be less than i_size */
+ if (offset >= isize)
+ return -EINVAL;
+
+ error = xfs_falloc_setsize(file, isize + len);
+ if (error)
+ return error;
+
+ /*
+ * Perform hole insertion now that the file size has been updated so
+ * that if we crash during the operation we don't leave shifted extents
+	 * past EOF and hence lose access to the data contained within them.
+ */
+ return xfs_insert_file_space(XFS_I(inode), offset, len);
+}
+
+/*
+ * Punch a hole and prealloc the range. We use a hole punch rather than
+ * unwritten extent conversion for two reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by
+ * virtue of the hole punch.
+ */
+static int
+xfs_falloc_zero_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ unsigned int blksize = i_blocksize(inode);
+ loff_t new_size = 0;
+ int error;
+
+ trace_xfs_zero_file_space(XFS_I(inode));
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_free_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+
+ len = round_up(offset + len, blksize) - round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
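
The rounding step above widens the user's byte range to whole filesystem blocks before preallocating over the punched hole. A standalone check of that arithmetic (plain C, not kernel code; the macros assume a power-of-two block size):

#include <assert.h>

#define round_down(x, y)	((x) & ~((y) - 1))
#define round_up(x, y)		round_down((x) + (y) - 1, (y))

int main(void)
{
        unsigned long long blksize = 4096, offset = 1000, len = 100;

        /* Expand [1000, 1100) to the covering block range [0, 4096). */
        len = round_up(offset + len, blksize) - round_down(offset, blksize);
        offset = round_down(offset, blksize);

        assert(offset == 0 && len == 4096);
        return 0;
}
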
+static int
+xfs_falloc_unshare_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_reflink_unshare(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
+static int
+xfs_falloc_allocate_range(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ loff_t new_size = 0;
+ int error;
+
+ /*
+	 * In always_cow mode we can't use preallocations and thus should not
+ * create them.
+ */
+ if (xfs_is_always_cow_inode(XFS_I(inode)))
+ return -EOPNOTSUPP;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ if (error)
+ return error;
+ return xfs_falloc_setsize(file, new_size);
+}
+
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@@ -868,8 +1054,6 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- loff_t new_size = 0;
- bool do_file_insert = false;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
@@ -890,156 +1074,35 @@ xfs_file_fallocate(
*/
inode_dio_wait(inode);
- /*
- * Now AIO and DIO has drained we flush and (if necessary) invalidate
- * the cached range over the first operation we are about to run.
- *
- * We care about zero and collapse here because they both run a hole
- * punch over the range first. Because that can zero data, and the range
- * of invalidation for the shift operations is much larger, we still do
- * the required flush for collapse in xfs_prepare_shift().
- *
- * Insert has the same range requirements as collapse, and we extend the
- * file first which can zero data. Hence insert has the same
- * flush/invalidate requirements as collapse and so they are both
- * handled at the right time by xfs_prepare_shift().
- */
- if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_COLLAPSE_RANGE)) {
- error = xfs_flush_unmap_range(ip, offset, len);
- if (error)
- goto out_unlock;
- }
-
error = file_modified(file);
if (error)
goto out_unlock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- if (!xfs_is_falloc_aligned(ip, offset, len)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- /*
- * There is no need to overlap collapse range with EOF,
- * in which case it is effectively a truncate operation
- */
- if (offset + len >= i_size_read(inode)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- new_size = i_size_read(inode) - len;
-
- error = xfs_collapse_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- } else if (mode & FALLOC_FL_INSERT_RANGE) {
- loff_t isize = i_size_read(inode);
-
- if (!xfs_is_falloc_aligned(ip, offset, len)) {
- error = -EINVAL;
- goto out_unlock;
- }
-
- /*
- * New inode size must not exceed ->s_maxbytes, accounting for
- * possible signed overflow.
- */
- if (inode->i_sb->s_maxbytes - isize < len) {
- error = -EFBIG;
- goto out_unlock;
- }
- new_size = isize + len;
-
- /* Offset should be less than i_size */
- if (offset >= isize) {
- error = -EINVAL;
- goto out_unlock;
- }
- do_file_insert = true;
- } else {
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
- if (error)
- goto out_unlock;
- }
-
- if (mode & FALLOC_FL_ZERO_RANGE) {
- /*
- * Punch a hole and prealloc the range. We use a hole
- * punch rather than unwritten extent conversion for two
- * reasons:
- *
- * 1.) Hole punch handles partial block zeroing for us.
- * 2.) If prealloc returns ENOSPC, the file range is
- * still zero-valued by virtue of the hole punch.
- */
- unsigned int blksize = i_blocksize(inode);
-
- trace_xfs_zero_file_space(ip);
-
- error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
-
- len = round_up(offset + len, blksize) -
- round_down(offset, blksize);
- offset = round_down(offset, blksize);
- } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
- } else {
- /*
- * If always_cow mode we can't use preallocations and
- * thus should not create them.
- */
- if (xfs_is_always_cow_inode(ip)) {
- error = -EOPNOTSUPP;
- goto out_unlock;
- }
- }
-
- if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
- }
- }
-
- /* Change file size if needed */
- if (new_size) {
- struct iattr iattr;
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = new_size;
- error = xfs_vn_setattr_size(file_mnt_idmap(file),
- file_dentry(file), &iattr);
- if (error)
- goto out_unlock;
- }
-
- /*
- * Perform hole insertion now that the file size has been
- * updated so that if we crash during the operation we don't
- * leave shifted extents past EOF and hence losing access to
- * the data that is contained within them.
- */
- if (do_file_insert) {
- error = xfs_insert_file_space(ip, offset, len);
- if (error)
- goto out_unlock;
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ error = xfs_falloc_collapse_range(file, offset, len);
+ break;
+ case FALLOC_FL_INSERT_RANGE:
+ error = xfs_falloc_insert_range(file, offset, len);
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ error = xfs_falloc_zero_range(file, mode, offset, len);
+ break;
+ case FALLOC_FL_UNSHARE_RANGE:
+ error = xfs_falloc_unshare_range(file, mode, offset, len);
+ break;
+ case FALLOC_FL_ALLOCATE_RANGE:
+ error = xfs_falloc_allocate_range(file, mode, offset, len);
+ break;
+ default:
+ error = -EOPNOTSUPP;
+ break;
}
- if (xfs_file_sync_writes(file))
+ if (!error && xfs_file_sync_writes(file))
error = xfs_log_force_inode(ip);
out_unlock:
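
From userspace, the modes dispatched by the switch above map directly onto fallocate(2) calls such as this sketch; offsets and lengths are illustrative, and collapse/insert additionally require allocation-unit alignment as enforced by the helpers:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

static int exercise_modes(int fd)
{
        /* Plain preallocation (FALLOC_FL_ALLOCATE_RANGE is mode 0). */
        if (fallocate(fd, 0, 0, 1 << 20) < 0)
                return -1;

        /* Zero a sub-range without changing the file size. */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, 4096, 8192) < 0)
                return -1;

        /* Hole punch always requires KEEP_SIZE. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 4096, 4096) < 0)
                return -1;

        /* Collapse and insert shift extents and must be block aligned. */
        if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, 4096) < 0)
                return -1;
        if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 0, 4096) < 0)
                return -1;

        return 0;
}
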
@@ -1175,12 +1238,78 @@ xfs_dir_open(
return error;
}
+/*
+ * Don't bother propagating errors. We're just doing cleanup, and the caller
+ * ignores the return value anyway.
+ */
STATIC int
xfs_file_release(
- struct inode *inode,
- struct file *filp)
+ struct inode *inode,
+ struct file *file)
{
- return xfs_release(XFS_I(inode));
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If this is a read-only mount or the file system has been shut down,
+ * don't generate I/O.
+ */
+ if (xfs_is_readonly(mp) || xfs_is_shutdown(mp))
+ return 0;
+
+ /*
+ * If we previously truncated this file and removed old data in the
+ * process, we want to initiate "early" writeout on the last close.
+ * This is an attempt to combat the notorious NULL files problem which
+ * is particularly noticeable from a truncate down, buffered (re-)write
+ * (delalloc), followed by a crash. What we are effectively doing here
+ * is significantly reducing the time window where we'd otherwise be
+ * exposed to that problem.
+ */
+ if (xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED)) {
+ xfs_iflags_clear(ip, XFS_EOFBLOCKS_RELEASED);
+ if (ip->i_delayed_blks > 0)
+ filemap_flush(inode->i_mapping);
+ }
+
+ /*
+ * XFS aggressively preallocates post-EOF space to generate contiguous
+ * allocations for writers that append to the end of the file.
+ *
+ * To support workloads that close and reopen the file frequently, these
+ * preallocations usually persist after a close unless it is the first
+	 * close for the inode. This is a tradeoff: it keeps data tightly packed
+	 * when unpacking tarballs or similar archives that write one file after
+	 * another without revisiting any of them, while still retaining the
+	 * preallocation for files that have recurring open/write/close cycles.
+ *
+ * This heuristic is skipped for inodes with the append-only flag as
+ * that flag is rather pointless for inodes written only once.
+ *
+ * There is no point in freeing blocks here for open but unlinked files
+ * as they will be taken care of by the inactivation path soon.
+ *
+ * When releasing a read-only context, don't flush data or trim post-EOF
+	 * blocks. This prevents open/read/close workloads from removing EOF
+ * blocks that other writers depend upon to reduce fragmentation.
+ *
+ * If we can't get the iolock just skip truncating the blocks past EOF
+ * because we could deadlock with the mmap_lock otherwise. We'll get
+ * another chance to drop them once the last reference to the inode is
+ * dropped, so we'll never leak blocks permanently.
+ */
+ if (inode->i_nlink &&
+ (file->f_mode & FMODE_WRITE) &&
+ !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
+ !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (xfs_can_free_eofblocks(ip) &&
+ !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
+ xfs_free_eofblocks(ip);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+
+ return 0;
}
STATIC int
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 85dbb46452ca..ae18ab86e608 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -44,7 +44,7 @@ xfs_fsmap_from_internal(
}
/* Convert an fsmap to an xfs_fsmap. */
-void
+static void
xfs_fsmap_to_internal(
struct xfs_fsmap *dest,
struct fsmap *src)
@@ -71,7 +71,7 @@ xfs_fsmap_owner_to_rmap(
switch (src->fmr_owner) {
case 0: /* "lowest owner id possible" */
case -1ULL: /* "highest owner id possible" */
- dest->rm_owner = 0;
+ dest->rm_owner = src->fmr_owner;
break;
case XFS_FMR_OWN_FREE:
dest->rm_owner = XFS_RMAP_OWN_NULL;
@@ -162,6 +162,7 @@ struct xfs_getfsmap_info {
xfs_daddr_t next_daddr; /* next daddr we expect */
/* daddr of low fsmap key when we're using the rtbitmap */
xfs_daddr_t low_daddr;
+ xfs_daddr_t end_daddr; /* daddr of high fsmap key */
u64 missing_owner; /* owner of holes */
u32 dev; /* device id */
/*
@@ -182,6 +183,7 @@ struct xfs_getfsmap_dev {
int (*fn)(struct xfs_trans *tp,
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info);
+ sector_t nr_sectors;
};
/* Compare two getfsmap device handlers. */
@@ -252,7 +254,7 @@ xfs_getfsmap_rec_before_start(
const struct xfs_rmap_irec *rec,
xfs_daddr_t rec_daddr)
{
- if (info->low_daddr != -1ULL)
+ if (info->low_daddr != XFS_BUF_DADDR_NULL)
return rec_daddr < info->low_daddr;
if (info->low.rm_blockcount)
return xfs_rmap_compare(rec, &info->low) < 0;
@@ -294,6 +296,18 @@ xfs_getfsmap_helper(
return 0;
}
+ /*
+ * For an info->last query, we're looking for a gap between the last
+ * mapping emitted and the high key specified by userspace. If the
+ * user's query spans less than 1 fsblock, then info->high and
+ * info->low will have the same rm_startblock, which causes rec_daddr
+ * and next_daddr to be the same. Therefore, use the end_daddr that
+ * we calculated from userspace's high key to synthesize the record.
+ * Note that if the btree query found a mapping, there won't be a gap.
+ */
+ if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL)
+ rec_daddr = info->end_daddr;
+
/* Are we just counting mappings? */
if (info->head->fmh_count == 0) {
if (info->head->fmh_entries == UINT_MAX)
@@ -427,141 +441,6 @@ xfs_getfsmap_set_irec_flags(
irec->rm_flags |= XFS_RMAP_UNWRITTEN;
}
-/* Execute a getfsmap query against the log device. */
-STATIC int
-xfs_getfsmap_logdev(
- struct xfs_trans *tp,
- const struct xfs_fsmap *keys,
- struct xfs_getfsmap_info *info)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_rmap_irec rmap;
- xfs_daddr_t rec_daddr, len_daddr;
- xfs_fsblock_t start_fsb, end_fsb;
- uint64_t eofs;
-
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
- if (keys[0].fmr_physical >= eofs)
- return 0;
- start_fsb = XFS_BB_TO_FSBT(mp,
- keys[0].fmr_physical + keys[0].fmr_length);
- end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
-
- /* Adjust the low key if we are continuing from where we left off. */
- if (keys[0].fmr_length > 0)
- info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb);
-
- trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb);
- trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb);
-
- if (start_fsb > 0)
- return 0;
-
- /* Fabricate an rmap entry for the external log device. */
- rmap.rm_startblock = 0;
- rmap.rm_blockcount = mp->m_sb.sb_logblocks;
- rmap.rm_owner = XFS_RMAP_OWN_LOG;
- rmap.rm_offset = 0;
- rmap.rm_flags = 0;
-
- rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock);
- len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount);
- return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr);
-}
-
-#ifdef CONFIG_XFS_RT
-/* Transform a rtbitmap "record" into a fsmap */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap_helper(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- const struct xfs_rtalloc_rec *rec,
- void *priv)
-{
- struct xfs_getfsmap_info *info = priv;
- struct xfs_rmap_irec irec;
- xfs_rtblock_t rtbno;
- xfs_daddr_t rec_daddr, len_daddr;
-
- rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
- rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
- irec.rm_startblock = rtbno;
-
- rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- len_daddr = XFS_FSB_TO_BB(mp, rtbno);
- irec.rm_blockcount = rtbno;
-
- irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
- irec.rm_offset = 0;
- irec.rm_flags = 0;
-
- return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr);
-}
-
-/* Execute a getfsmap query against the realtime device rtbitmap. */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap(
- struct xfs_trans *tp,
- const struct xfs_fsmap *keys,
- struct xfs_getfsmap_info *info)
-{
-
- struct xfs_rtalloc_rec alow = { 0 };
- struct xfs_rtalloc_rec ahigh = { 0 };
- struct xfs_mount *mp = tp->t_mountp;
- xfs_rtblock_t start_rtb;
- xfs_rtblock_t end_rtb;
- uint64_t eofs;
- int error;
-
- eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
- if (keys[0].fmr_physical >= eofs)
- return 0;
- start_rtb = XFS_BB_TO_FSBT(mp,
- keys[0].fmr_physical + keys[0].fmr_length);
- end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
-
- info->missing_owner = XFS_FMR_OWN_UNKNOWN;
-
- /* Adjust the low key if we are continuing from where we left off. */
- if (keys[0].fmr_length > 0) {
- info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb);
- if (info->low_daddr >= eofs)
- return 0;
- }
-
- trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
- trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
-
- xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
-
- /*
- * Set up query parameters to return free rtextents covering the range
- * we want.
- */
- alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb);
- ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb);
- error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
- xfs_getfsmap_rtdev_rtbitmap_helper, info);
- if (error)
- goto err;
-
- /*
- * Report any gaps at the end of the rtbitmap by simulating a null
- * rmap starting at the block after the end of the query range.
- */
- info->last = true;
- ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
-
- error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
- if (error)
- goto err;
-err:
- xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
- return error;
-}
-#endif /* CONFIG_XFS_RT */
-
static inline bool
rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r)
{
@@ -786,6 +665,140 @@ xfs_getfsmap_datadev_bnobt(
xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
}
+/* Execute a getfsmap query against the log device. */
+STATIC int
+xfs_getfsmap_logdev(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rmap_irec rmap;
+ xfs_daddr_t rec_daddr, len_daddr;
+ xfs_fsblock_t start_fsb, end_fsb;
+ uint64_t eofs;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ start_fsb = XFS_BB_TO_FSBT(mp,
+ keys[0].fmr_physical + keys[0].fmr_length);
+ end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (keys[0].fmr_length > 0)
+ info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb);
+
+ trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb);
+ trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb);
+
+ if (start_fsb > 0)
+ return 0;
+
+ /* Fabricate an rmap entry for the external log device. */
+ rmap.rm_startblock = 0;
+ rmap.rm_blockcount = mp->m_sb.sb_logblocks;
+ rmap.rm_owner = XFS_RMAP_OWN_LOG;
+ rmap.rm_offset = 0;
+ rmap.rm_flags = 0;
+
+ rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock);
+ len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount);
+ return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr);
+}
+
+#ifdef CONFIG_XFS_RT
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_rtblock_t rtbno;
+ xfs_daddr_t rec_daddr, len_daddr;
+
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
+ irec.rm_startblock = rtbno;
+
+ rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+ len_daddr = XFS_FSB_TO_BB(mp, rtbno);
+ irec.rm_blockcount = rtbno;
+
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr);
+}
+
+/* Execute a getfsmap query against the realtime device rtbitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap(
+ struct xfs_trans *tp,
+ const struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+
+ struct xfs_rtalloc_rec ahigh = { 0 };
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtblock_t start_rtb;
+ xfs_rtblock_t end_rtb;
+ xfs_rtxnum_t high;
+ uint64_t eofs;
+ int error;
+
+ eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ start_rtb = XFS_BB_TO_FSBT(mp,
+ keys[0].fmr_physical + keys[0].fmr_length);
+ end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
+
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+
+ /* Adjust the low key if we are continuing from where we left off. */
+ if (keys[0].fmr_length > 0) {
+ info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb);
+ if (info->low_daddr >= eofs)
+ return 0;
+ }
+
+ trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
+ trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
+
+ xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
+
+ /*
+ * Set up query parameters to return free rtextents covering the range
+ * we want.
+ */
+ high = xfs_rtb_to_rtxup(mp, end_rtb);
+ error = xfs_rtalloc_query_range(mp, tp, xfs_rtb_to_rtx(mp, start_rtb),
+ high, xfs_getfsmap_rtdev_rtbitmap_helper, info);
+ if (error)
+ goto err;
+
+ /*
+ * Report any gaps at the end of the rtbitmap by simulating a null
+ * rmap starting at the block after the end of the query range.
+ */
+ info->last = true;
+ ahigh.ar_startext = min(mp->m_sb.sb_rextents, high);
+
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
+ if (error)
+ goto err;
+err:
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+ return error;
+}
+#endif /* CONFIG_XFS_RT */
+
/* Do we recognize the device? */
STATIC bool
xfs_getfsmap_is_valid_device(
@@ -876,7 +889,7 @@ xfs_getfsmap_check_keys(
* xfs_getfsmap_info.low/high -- per-AG low/high keys computed from
* dkeys; used to query the metadata.
*/
-int
+STATIC int
xfs_getfsmap(
struct xfs_mount *mp,
struct xfs_fsmap_head *head,
@@ -904,17 +917,21 @@ xfs_getfsmap(
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
+ handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
handlers[0].fn = xfs_getfsmap_datadev_bnobt;
if (mp->m_logdev_targp != mp->m_ddev_targp) {
+ handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
+ mp->m_sb.sb_logblocks);
handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
if (mp->m_rtdev_targp) {
+ handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
}
@@ -946,6 +963,7 @@ xfs_getfsmap(
info.next_daddr = head->fmh_keys[0].fmr_physical +
head->fmh_keys[0].fmr_length;
+ info.end_daddr = XFS_BUF_DADDR_NULL;
info.fsmap_recs = fsmap_recs;
info.head = head;
@@ -966,8 +984,11 @@ xfs_getfsmap(
* low key, zero out the low key so that we get
* everything from the beginning.
*/
- if (handlers[i].dev == head->fmh_keys[1].fmr_device)
+ if (handlers[i].dev == head->fmh_keys[1].fmr_device) {
dkeys[1] = head->fmh_keys[1];
+ info.end_daddr = min(handlers[i].nr_sectors - 1,
+ dkeys[1].fmr_physical);
+ }
if (handlers[i].dev > head->fmh_keys[0].fmr_device)
memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
@@ -983,7 +1004,7 @@ xfs_getfsmap(
info.dev = handlers[i].dev;
info.last = false;
info.pag = NULL;
- info.low_daddr = -1ULL;
+ info.low_daddr = XFS_BUF_DADDR_NULL;
info.low.rm_blockcount = 0;
error = handlers[i].fn(tp, dkeys, &info);
if (error)
@@ -998,3 +1019,133 @@ xfs_getfsmap(
head->fmh_oflags = FMH_OF_DEV_T;
return error;
}
+
+int
+xfs_ioc_getfsmap(
+ struct xfs_inode *ip,
+ struct fsmap_head __user *arg)
+{
+ struct xfs_fsmap_head xhead = {0};
+ struct fsmap_head head;
+ struct fsmap *recs;
+ unsigned int count;
+ __u32 last_flags = 0;
+ bool done = false;
+ int error;
+
+ if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+ return -EFAULT;
+ if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+ memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+ sizeof(head.fmh_keys[0].fmr_reserved)) ||
+ memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+ sizeof(head.fmh_keys[1].fmr_reserved)))
+ return -EINVAL;
+
+ /*
+ * Use an internal memory buffer so that we don't have to copy fsmap
+ * data to userspace while holding locks. Start by trying to allocate
+ * up to 128k for the buffer, but fall back to a single page if needed.
+ */
+ count = min_t(unsigned int, head.fmh_count,
+ 131072 / sizeof(struct fsmap));
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs) {
+ count = min_t(unsigned int, head.fmh_count,
+ PAGE_SIZE / sizeof(struct fsmap));
+ recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
+ if (!recs)
+ return -ENOMEM;
+ }
+
+ xhead.fmh_iflags = head.fmh_iflags;
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
+ xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
+
+ head.fmh_entries = 0;
+ do {
+ struct fsmap __user *user_recs;
+ struct fsmap *last_rec;
+
+ user_recs = &arg->fmh_recs[head.fmh_entries];
+ xhead.fmh_entries = 0;
+ xhead.fmh_count = min_t(unsigned int, count,
+ head.fmh_count - head.fmh_entries);
+
+ /* Run query, record how many entries we got. */
+ error = xfs_getfsmap(ip->i_mount, &xhead, recs);
+ switch (error) {
+ case 0:
+ /*
+ * There are no more records in the result set. Copy
+ * whatever we got to userspace and break out.
+ */
+ done = true;
+ break;
+ case -ECANCELED:
+ /*
+ * The internal memory buffer is full. Copy whatever
+ * records we got to userspace and go again if we have
+ * not yet filled the userspace buffer.
+ */
+ error = 0;
+ break;
+ default:
+ goto out_free;
+ }
+ head.fmh_entries += xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
+
+ /*
+ * If the caller wanted a record count or there aren't any
+ * new records to return, we're done.
+ */
+ if (head.fmh_count == 0 || xhead.fmh_entries == 0)
+ break;
+
+ /* Copy all the records we got out to userspace. */
+ if (copy_to_user(user_recs, recs,
+ xhead.fmh_entries * sizeof(struct fsmap))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+ /* Remember the last record flags we copied to userspace. */
+ last_rec = &recs[xhead.fmh_entries - 1];
+ last_flags = last_rec->fmr_flags;
+
+ /* Set up the low key for the next iteration. */
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ } while (!done && head.fmh_entries < head.fmh_count);
+
+ /*
+ * If there are no more records in the query result set and we're not
+ * in counting mode, mark the last record returned with the LAST flag.
+ */
+ if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
+ struct fsmap __user *user_rec;
+
+ last_flags |= FMR_OF_LAST;
+ user_rec = &arg->fmh_recs[head.fmh_entries - 1];
+
+ if (copy_to_user(&user_rec->fmr_flags, &last_flags,
+ sizeof(last_flags))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+ }
+
+ /* copy back header */
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+out_free:
+ kvfree(recs);
+ return error;
+}
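
The loop above pages query results through an in-kernel bounce buffer; the matching userspace pattern re-keys each GETFSMAP call off the last record returned and stops at FMR_OF_LAST. A sketch using the linux/fsmap.h definitions (buffer size and output format are arbitrary):

#include <linux/fsmap.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

#define NR_RECS	128

static int dump_fsmap(int fd)
{
        struct fsmap_head *head;
        int done = 0;

        head = calloc(1, sizeof(*head) + NR_RECS * sizeof(struct fsmap));
        if (!head)
                return -1;
        head->fmh_count = NR_RECS;
        head->fmh_keys[1].fmr_device = ~0U;	/* high key: everything */
        head->fmh_keys[1].fmr_physical = ~0ULL;
        head->fmh_keys[1].fmr_owner = ~0ULL;
        head->fmh_keys[1].fmr_offset = ~0ULL;
        head->fmh_keys[1].fmr_flags = ~0U;

        while (!done) {
                unsigned int i;

                if (ioctl(fd, FS_IOC_GETFSMAP, head) < 0) {
                        free(head);
                        return -1;
                }
                if (!head->fmh_entries)
                        break;
                for (i = 0; i < head->fmh_entries; i++) {
                        struct fsmap *rec = &head->fmh_recs[i];

                        printf("dev %u: %llu +%llu owner %llu\n", rec->fmr_device,
                               (unsigned long long)rec->fmr_physical,
                               (unsigned long long)rec->fmr_length,
                               (unsigned long long)rec->fmr_owner);
                        if (rec->fmr_flags & FMR_OF_LAST)
                                done = 1;
                }
                /* Continue from the last record we saw. */
                head->fmh_keys[0] = head->fmh_recs[head->fmh_entries - 1];
        }
        free(head);
        return 0;
}
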
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
index a0775788e7b1..a0bcc38486a5 100644
--- a/fs/xfs/xfs_fsmap.h
+++ b/fs/xfs/xfs_fsmap.h
@@ -7,6 +7,7 @@
#define __XFS_FSMAP_H__
struct fsmap;
+struct fsmap_head;
/* internal fsmap representation */
struct xfs_fsmap {
@@ -27,9 +28,6 @@ struct xfs_fsmap_head {
struct xfs_fsmap fmh_keys[2]; /* low and high keys */
};
-void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
-
-int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
- struct fsmap *out_recs);
+int xfs_ioc_getfsmap(struct xfs_inode *ip, struct fsmap_head __user *arg);
#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c211ea2b63c4..3643cc843f62 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -485,7 +485,7 @@ xfs_do_force_shutdown(
const char *why;
- if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
+ if (xfs_set_shutdown(mp)) {
xlog_shutdown_wait(mp->m_log);
return;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index cf629302d48e..20d9924f28c2 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -65,6 +65,18 @@ static int xfs_icwalk_ag(struct xfs_perag *pag,
XFS_ICWALK_FLAG_RECLAIM_SICK | \
XFS_ICWALK_FLAG_UNION)
+/* Marks for the perag xarray */
+#define XFS_PERAG_RECLAIM_MARK XA_MARK_0
+#define XFS_PERAG_BLOCKGC_MARK XA_MARK_1
+
+static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
+{
+ if (tag == XFS_ICI_RECLAIM_TAG)
+ return XFS_PERAG_RECLAIM_MARK;
+ ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
+ return XFS_PERAG_BLOCKGC_MARK;
+}
+
/*
* Allocate and initialise an xfs_inode.
*/
@@ -191,7 +203,7 @@ xfs_reclaim_work_queue(
{
rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
}
@@ -241,9 +253,7 @@ xfs_perag_set_inode_tag(
return;
/* propagate the tag up into the perag radix tree */
- spin_lock(&mp->m_perag_lock);
- radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
- spin_unlock(&mp->m_perag_lock);
+ xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
/* start background work */
switch (tag) {
@@ -285,14 +295,39 @@ xfs_perag_clear_inode_tag(
return;
/* clear the tag from the perag radix tree */
- spin_lock(&mp->m_perag_lock);
- radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
- spin_unlock(&mp->m_perag_lock);
+ xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
}
/*
+ * Find the next AG after @pag, or the first AG if @pag is NULL.
+ */
+static struct xfs_perag *
+xfs_perag_grab_next_tag(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ int tag)
+{
+ unsigned long index = 0;
+
+ if (pag) {
+ index = pag->pag_agno + 1;
+ xfs_perag_rele(pag);
+ }
+
+ rcu_read_lock();
+ pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
+ if (pag) {
+ trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
+ if (!atomic_inc_not_zero(&pag->pag_active_ref))
+ pag = NULL;
+ }
+ rcu_read_unlock();
+ return pag;
+}
+
+/*
* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
* part of the structure. This is made more complex by the fact we store
* information about the on-disk values in the VFS inode and so we can't just
@@ -755,7 +790,7 @@ xfs_iget(
ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
/* reject inode numbers outside existing AGs */
- if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ if (!xfs_verify_ino(mp, ino))
return -EINVAL;
XFS_STATS_INC(mp, xs_ig_attempts);
@@ -977,7 +1012,7 @@ xfs_reclaim_inodes(
if (xfs_want_reclaim_sick(mp))
icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
- while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
xfs_ail_push_all_sync(mp->m_ail);
xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
}
@@ -1019,15 +1054,17 @@ long
xfs_reclaim_inodes_count(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t ag = 0;
+ XA_STATE (xas, &mp->m_perags, 0);
long reclaimable = 0;
+ struct xfs_perag *pag;
- while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
- ag = pag->pag_agno + 1;
+ rcu_read_lock();
+ xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) {
+ trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
reclaimable += pag->pag_ici_reclaimable;
- xfs_perag_put(pag);
}
+ rcu_read_unlock();
+
return reclaimable;
}
@@ -1159,7 +1196,7 @@ xfs_inode_free_eofblocks(
if (xfs_can_free_eofblocks(ip))
return xfs_free_eofblocks(ip);
- /* inode could be preallocated or append-only */
+ /* inode could be preallocated */
trace_xfs_inode_free_eofblocks_invalid(ip);
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@@ -1369,14 +1406,13 @@ void
xfs_blockgc_start(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
if (xfs_set_blockgc_enabled(mp))
return;
trace_xfs_blockgc_start(mp, __return_address);
- for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
xfs_blockgc_queue(pag);
}
@@ -1492,21 +1528,19 @@ int
xfs_blockgc_flush_all(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
trace_xfs_blockgc_flush_all(mp, __return_address);
/*
- * For each blockgc worker, move its queue time up to now. If it
- * wasn't queued, it will not be requeued. Then flush whatever's
- * left.
+ * For each blockgc worker, move its queue time up to now. If it wasn't
+ * queued, it will not be requeued. Then flush whatever is left.
*/
- for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
mod_delayed_work(pag->pag_mount->m_blockgc_wq,
&pag->pag_blockgc_work, 0);
- for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
flush_delayed_work(&pag->pag_blockgc_work);
return xfs_inodegc_flush(mp);
@@ -1752,12 +1786,11 @@ xfs_icwalk(
enum xfs_icwalk_goal goal,
struct xfs_icwalk *icw)
{
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
int error = 0;
int last_error = 0;
- xfs_agnumber_t agno;
- for_each_perag_tag(mp, agno, pag, goal) {
+ while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) {
error = xfs_icwalk_ag(pag, goal, icw);
if (error) {
last_error = error;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7dc6f326936c..bcc277fc0a83 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -704,7 +704,7 @@ xfs_create(
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
+ error = xfs_dialloc(&tp, args, &ino);
if (!error)
error = xfs_icreate(tp, ino, args, &du.ip);
if (error)
@@ -812,7 +812,7 @@ xfs_create_tmpfile(
if (error)
goto out_release_dquots;
- error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
+ error = xfs_dialloc(&tp, args, &ino);
if (!error)
error = xfs_icreate(tp, ino, args, &ip);
if (error)
@@ -1079,88 +1079,6 @@ out:
return error;
}
-int
-xfs_release(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- int error = 0;
-
- if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
- return 0;
-
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (xfs_is_readonly(mp))
- return 0;
-
- if (!xfs_is_shutdown(mp)) {
- int truncated;
-
- /*
- * If we previously truncated this file and removed old data
- * in the process, we want to initiate "early" writeout on
- * the last close. This is an attempt to combat the notorious
- * NULL files problem which is particularly noticeable from a
- * truncate down, buffered (re-)write (delalloc), followed by
- * a crash. What we are effectively doing here is
- * significantly reducing the time window where we'd otherwise
- * be exposed to that problem.
- */
- truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
- if (truncated) {
- xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
- if (ip->i_delayed_blks > 0) {
- error = filemap_flush(VFS_I(ip)->i_mapping);
- if (error)
- return error;
- }
- }
- }
-
- if (VFS_I(ip)->i_nlink == 0)
- return 0;
-
- /*
- * If we can't get the iolock just skip truncating the blocks past EOF
- * because we could deadlock with the mmap_lock otherwise. We'll get
- * another chance to drop them once the last reference to the inode is
- * dropped, so we'll never leak blocks permanently.
- */
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
- return 0;
-
- if (xfs_can_free_eofblocks(ip)) {
- /*
- * Check if the inode is being opened, written and closed
- * frequently and we have delayed allocation blocks outstanding
- * (e.g. streaming writes from the NFS server), truncating the
- * blocks past EOF will cause fragmentation to occur.
- *
- * In this case don't do the truncation, but we have to be
- * careful how we detect this case. Blocks beyond EOF show up as
- * i_delayed_blks even when the inode is clean, so we need to
- * truncate them away first before checking for a dirty release.
- * Hence on the first dirty close we will still remove the
- * speculative allocation, but after that we will leave it in
- * place.
- */
- if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
- goto out_unlock;
-
- error = xfs_free_eofblocks(ip);
- if (error)
- goto out_unlock;
-
- /* delalloc blocks after truncation means it really is dirty */
- if (ip->i_delayed_blks)
- xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
- }
-
-out_unlock:
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
-}
-
/*
* Mark all the buffers attached to this directory stale. In theory we should
* never be freeing a directory with any blocks at all, but this covers the
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 51defdebef30..97ed912306fd 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -276,12 +276,13 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
}
-static inline bool xfs_is_metadata_inode(struct xfs_inode *ip)
+static inline bool xfs_is_metadata_inode(const struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- return ip == mp->m_rbmip || ip == mp->m_rsumip ||
- xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
+ return ip->i_ino == mp->m_sb.sb_rbmino ||
+ ip->i_ino == mp->m_sb.sb_rsumino ||
+ xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
bool xfs_is_always_cow_inode(struct xfs_inode *ip);
@@ -335,7 +336,7 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
#define XFS_INEW (1 << 3) /* inode has just been allocated */
#define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
+#define XFS_EOFBLOCKS_RELEASED (1 << 6) /* eofblocks were freed in ->release */
#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
@@ -382,7 +383,7 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
*/
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
- XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
+ XFS_EOFBLOCKS_RELEASED | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
XFS_INACTIVATING | XFS_IQUOTAUNCHECKED)
/*
@@ -512,7 +513,6 @@ enum layout_break_reason {
#define XFS_INHERIT_GID(pip) \
(xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
-int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 4e933db75b12..7226d27e8afc 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -483,6 +483,17 @@ xfs_ioctl_setattr_xflags(
/* Can't change realtime flag if any extents are allocated. */
if (ip->i_df.if_nextents || ip->i_delayed_blks)
return -EINVAL;
+
+ /*
+		 * If S_DAX is enabled on this file, we can only switch between
+		 * the data and RT devices if both of them support fsdax. We
+		 * can't update S_DAX because
+ * there might be other threads walking down the access paths.
+ */
+ if (IS_DAX(VFS_I(ip)) &&
+ (mp->m_ddev_targp->bt_daxdev == NULL ||
+ (mp->m_rtdev_targp &&
+ mp->m_rtdev_targp->bt_daxdev == NULL)))
+ return -EINVAL;
}
if (rtflag) {
@@ -865,136 +876,6 @@ out_free_buf:
return error;
}
-STATIC int
-xfs_ioc_getfsmap(
- struct xfs_inode *ip,
- struct fsmap_head __user *arg)
-{
- struct xfs_fsmap_head xhead = {0};
- struct fsmap_head head;
- struct fsmap *recs;
- unsigned int count;
- __u32 last_flags = 0;
- bool done = false;
- int error;
-
- if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
- return -EFAULT;
- if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
- memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
- sizeof(head.fmh_keys[0].fmr_reserved)) ||
- memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
- sizeof(head.fmh_keys[1].fmr_reserved)))
- return -EINVAL;
-
- /*
- * Use an internal memory buffer so that we don't have to copy fsmap
- * data to userspace while holding locks. Start by trying to allocate
- * up to 128k for the buffer, but fall back to a single page if needed.
- */
- count = min_t(unsigned int, head.fmh_count,
- 131072 / sizeof(struct fsmap));
- recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
- if (!recs) {
- count = min_t(unsigned int, head.fmh_count,
- PAGE_SIZE / sizeof(struct fsmap));
- recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
- if (!recs)
- return -ENOMEM;
- }
-
- xhead.fmh_iflags = head.fmh_iflags;
- xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
- xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
-
- trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
- trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
-
- head.fmh_entries = 0;
- do {
- struct fsmap __user *user_recs;
- struct fsmap *last_rec;
-
- user_recs = &arg->fmh_recs[head.fmh_entries];
- xhead.fmh_entries = 0;
- xhead.fmh_count = min_t(unsigned int, count,
- head.fmh_count - head.fmh_entries);
-
- /* Run query, record how many entries we got. */
- error = xfs_getfsmap(ip->i_mount, &xhead, recs);
- switch (error) {
- case 0:
- /*
- * There are no more records in the result set. Copy
- * whatever we got to userspace and break out.
- */
- done = true;
- break;
- case -ECANCELED:
- /*
- * The internal memory buffer is full. Copy whatever
- * records we got to userspace and go again if we have
- * not yet filled the userspace buffer.
- */
- error = 0;
- break;
- default:
- goto out_free;
- }
- head.fmh_entries += xhead.fmh_entries;
- head.fmh_oflags = xhead.fmh_oflags;
-
- /*
- * If the caller wanted a record count or there aren't any
- * new records to return, we're done.
- */
- if (head.fmh_count == 0 || xhead.fmh_entries == 0)
- break;
-
- /* Copy all the records we got out to userspace. */
- if (copy_to_user(user_recs, recs,
- xhead.fmh_entries * sizeof(struct fsmap))) {
- error = -EFAULT;
- goto out_free;
- }
-
- /* Remember the last record flags we copied to userspace. */
- last_rec = &recs[xhead.fmh_entries - 1];
- last_flags = last_rec->fmr_flags;
-
- /* Set up the low key for the next iteration. */
- xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
- trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
- } while (!done && head.fmh_entries < head.fmh_count);
-
- /*
- * If there are no more records in the query result set and we're not
- * in counting mode, mark the last record returned with the LAST flag.
- */
- if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
- struct fsmap __user *user_rec;
-
- last_flags |= FMR_OF_LAST;
- user_rec = &arg->fmh_recs[head.fmh_entries - 1];
-
- if (copy_to_user(&user_rec->fmr_flags, &last_flags,
- sizeof(last_flags))) {
- error = -EFAULT;
- goto out_free;
- }
- }
-
- /* copy back header */
- if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
- error = -EFAULT;
- goto out_free;
- }
-
-out_free:
- kvfree(recs);
- return error;
-}
-
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -1507,6 +1388,10 @@ xfs_file_ioctl(
case XFS_IOC_EXCHANGE_RANGE:
return xfs_ioc_exchange_range(filp, arg);
+ case XFS_IOC_START_COMMIT:
+ return xfs_ioc_start_commit(filp, arg);
+ case XFS_IOC_COMMIT_RANGE:
+ return xfs_ioc_commit_range(filp, arg);
default:
return -ENOTTY;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 817ea7e0a8ab..26b2f5887b88 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3495,7 +3495,7 @@ xlog_force_shutdown(
* If this log shutdown also sets the mount shutdown state, issue a
* shutdown warning message.
*/
- if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+ if (!xfs_set_shutdown(log->l_mp)) {
xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
"Filesystem has been shut down due to log error (0x%x).",
shutdown_flags);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4423dd344239..1a74fe22672e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1336,7 +1336,7 @@ xlog_find_tail(
* headers if we have a filesystem using non-persistent counters.
*/
if (clean)
- set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
+ xfs_set_clean(log->l_mp);
/*
* Make sure that there are no blocks in front of the head
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 09eef1721ef4..460f93a9ce00 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -595,7 +595,7 @@ xfs_unmount_flush_inodes(
xfs_extent_busy_wait_all(mp);
flush_workqueue(xfs_discard_wq);
- set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate);
+ xfs_set_unmounting(mp);
xfs_ail_push_all_sync(mp->m_ail);
xfs_inodegc_stop(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d0567dfbc036..96496f39f551 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -147,7 +147,7 @@ typedef struct xfs_mount {
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
uint m_rsumlevels; /* rt summary levels */
- uint m_rsumsize; /* size of rt summary, bytes */
+ xfs_filblks_t m_rsumblocks; /* size of rt summary, FSBs */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_qflags; /* quota status flags */
uint64_t m_features; /* active filesystem features */
@@ -208,8 +208,7 @@ typedef struct xfs_mount {
*/
atomic64_t m_allocbt_blks;
- struct radix_tree_root m_perag_tree; /* per-ag accounting info */
- spinlock_t m_perag_lock; /* lock for m_perag_tree */
+ struct xarray m_perags; /* per-ag accounting info */
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 7443debaffd6..d0f5b403bdbe 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -230,9 +230,8 @@ _xfs_mru_cache_clear_reap_list(
__releases(mru->lock) __acquires(mru->lock)
{
struct xfs_mru_cache_elem *elem, *next;
- struct list_head tmp;
+ LIST_HEAD(tmp);
- INIT_LIST_HEAD(&tmp);
list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) {
/* Remove the element from the data store. */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 9490b913a4ab..7e2307921deb 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -799,7 +799,7 @@ xfs_qm_qino_alloc(
};
xfs_ino_t ino;
- error = xfs_dialloc(&tp, 0, S_IFREG, &ino);
+ error = xfs_dialloc(&tp, &args, &ino);
if (!error)
error = xfs_icreate(tp, ino, &args, ipp);
if (error) {
@@ -1539,6 +1539,43 @@ xfs_qm_mount_quotas(
}
/*
+ * Load the inode for a given type of quota, assuming that the sb fields have
+ * been sorted out. This is not true when switching quota types on a V4
+ * filesystem, so do not use this function for that.
+ *
+ * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on
+ * success; or a negative errno.
+ */
+int
+xfs_qm_qino_load(
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ struct xfs_inode **ipp)
+{
+ xfs_ino_t ino = NULLFSINO;
+
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ ino = mp->m_sb.sb_uquotino;
+ break;
+ case XFS_DQTYPE_GROUP:
+ ino = mp->m_sb.sb_gquotino;
+ break;
+ case XFS_DQTYPE_PROJ:
+ ino = mp->m_sb.sb_pquotino;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ if (ino == NULLFSINO)
+ return -ENOENT;
+
+ return xfs_iget(mp, NULL, ino, 0, 0, ipp);
+}
+
+/*
* This is called after the superblock has been read in and we're ready to
* iget the quota inodes.
*/
@@ -1561,24 +1598,21 @@ xfs_qm_init_quotainos(
if (XFS_IS_UQUOTA_ON(mp) &&
mp->m_sb.sb_uquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_uquotino > 0);
- error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip);
+ error = xfs_qm_qino_load(mp, XFS_DQTYPE_USER, &uip);
if (error)
return error;
}
if (XFS_IS_GQUOTA_ON(mp) &&
mp->m_sb.sb_gquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_gquotino > 0);
- error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip);
+ error = xfs_qm_qino_load(mp, XFS_DQTYPE_GROUP, &gip);
if (error)
goto error_rele;
}
if (XFS_IS_PQUOTA_ON(mp) &&
mp->m_sb.sb_pquotino != NULLFSINO) {
ASSERT(mp->m_sb.sb_pquotino > 0);
- error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
- 0, 0, &pip);
+ error = xfs_qm_qino_load(mp, XFS_DQTYPE_PROJ, &pip);
if (error)
goto error_rele;
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 6e09dfcd13e2..e919c7f62f57 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -184,4 +184,7 @@ xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type)
}
}
+int xfs_qm_qino_load(struct xfs_mount *mp, xfs_dqtype_t type,
+ struct xfs_inode **ipp);
+
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 392cb39cc10c..4eda50ae2d1c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -53,16 +53,15 @@ xfs_qm_scall_quotaoff(
STATIC int
xfs_qm_scall_trunc_qfile(
struct xfs_mount *mp,
- xfs_ino_t ino)
+ xfs_dqtype_t type)
{
struct xfs_inode *ip;
struct xfs_trans *tp;
int error;
- if (ino == NULLFSINO)
+ error = xfs_qm_qino_load(mp, type, &ip);
+ if (error == -ENOENT)
return 0;
-
- error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
if (error)
return error;
@@ -113,17 +112,17 @@ xfs_qm_scall_trunc_qfiles(
}
if (flags & XFS_QMOPT_UQUOTA) {
- error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
+ error = xfs_qm_scall_trunc_qfile(mp, XFS_DQTYPE_USER);
if (error)
return error;
}
if (flags & XFS_QMOPT_GQUOTA) {
- error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+ error = xfs_qm_scall_trunc_qfile(mp, XFS_DQTYPE_GROUP);
if (error)
return error;
}
if (flags & XFS_QMOPT_PQUOTA)
- error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+ error = xfs_qm_scall_trunc_qfile(mp, XFS_DQTYPE_PROJ);
return error;
}
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 9c162e69976b..4c7f7ce4fd2f 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -16,24 +16,25 @@
#include "xfs_qm.h"
-static void
+static int
xfs_qm_fill_state(
struct qc_type_state *tstate,
struct xfs_mount *mp,
- struct xfs_inode *ip,
- xfs_ino_t ino,
- struct xfs_def_quota *defq)
+ xfs_dqtype_t type)
{
- bool tempqip = false;
-
- tstate->ino = ino;
- if (!ip && ino == NULLFSINO)
- return;
- if (!ip) {
- if (xfs_iget(mp, NULL, ino, 0, 0, &ip))
- return;
- tempqip = true;
+ struct xfs_inode *ip;
+ struct xfs_def_quota *defq;
+ int error;
+
+ error = xfs_qm_qino_load(mp, type, &ip);
+ if (error) {
+ tstate->ino = NULLFSINO;
+ return error != -ENOENT ? error : 0;
}
+
+ defq = xfs_get_defquota(mp->m_quotainfo, type);
+
+ tstate->ino = ip->i_ino;
tstate->flags |= QCI_SYSFILE;
tstate->blocks = ip->i_nblocks;
tstate->nextents = ip->i_df.if_nextents;
@@ -43,8 +44,9 @@ xfs_qm_fill_state(
tstate->spc_warnlimit = 0;
tstate->ino_warnlimit = 0;
tstate->rt_spc_warnlimit = 0;
- if (tempqip)
- xfs_irele(ip);
+ xfs_irele(ip);
+
+ return 0;
}
/*
@@ -56,8 +58,9 @@ xfs_fs_get_quota_state(
struct super_block *sb,
struct qc_state *state)
{
- struct xfs_mount *mp = XFS_M(sb);
- struct xfs_quotainfo *q = mp->m_quotainfo;
+ struct xfs_mount *mp = XFS_M(sb);
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ int error;
memset(state, 0, sizeof(*state));
if (!XFS_IS_QUOTA_ON(mp))
@@ -76,12 +79,18 @@ xfs_fs_get_quota_state(
if (XFS_IS_PQUOTA_ENFORCED(mp))
state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED;
- xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip,
- mp->m_sb.sb_uquotino, &q->qi_usr_default);
- xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip,
- mp->m_sb.sb_gquotino, &q->qi_grp_default);
- xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip,
- mp->m_sb.sb_pquotino, &q->qi_prj_default);
+ error = xfs_qm_fill_state(&state->s_state[USRQUOTA], mp,
+ XFS_DQTYPE_USER);
+ if (error)
+ return error;
+ error = xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp,
+ XFS_DQTYPE_GROUP);
+ if (error)
+ return error;
+ error = xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp,
+ XFS_DQTYPE_PROJ);
+ if (error)
+ return error;
return 0;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 0c3e96c621a6..3a2005a1e673 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -142,7 +142,7 @@ xfs_rtallocate_range(
* We need to find the beginning and end of the extent so we can
* properly update the summary.
*/
- error = xfs_rtfind_back(args, start, 0, &preblock);
+ error = xfs_rtfind_back(args, start, &preblock);
if (error)
return error;
@@ -194,6 +194,17 @@ xfs_rtallocate_range(
return xfs_rtmodify_range(args, start, len, 0);
}
+/* Reduce @rtxlen until it is a multiple of @prod. */
+static inline xfs_rtxlen_t
+xfs_rtalloc_align_len(
+ xfs_rtxlen_t rtxlen,
+ xfs_rtxlen_t prod)
+{
+ if (unlikely(prod > 1))
+ return rounddown(rtxlen, prod);
+ return rtxlen;
+}
+
/*
* Make sure we don't run off the end of the rt volume. Be careful that
* adjusting maxlen downwards doesn't cause us to fail the alignment checks.
@@ -208,7 +219,7 @@ xfs_rtallocate_clamp_len(
xfs_rtxlen_t ret;
ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx;
- return rounddown(ret, prod);
+ return xfs_rtalloc_align_len(ret, prod);
}
/*
@@ -229,39 +240,40 @@ xfs_rtallocate_extent_block(
xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
struct xfs_mount *mp = args->mp;
- xfs_rtxnum_t besti; /* best rtext found so far */
- xfs_rtxnum_t bestlen;/* best length found so far */
+ xfs_rtxnum_t besti = -1; /* best rtext found so far */
xfs_rtxnum_t end; /* last rtext in chunk */
- int error;
xfs_rtxnum_t i; /* current rtext trying */
xfs_rtxnum_t next; /* next rtext to try */
+ xfs_rtxlen_t scanlen; /* number of free rtx to look for */
+ xfs_rtxlen_t bestlen = 0; /* best length found so far */
int stat; /* status from internal calls */
+ int error;
/*
- * Loop over all the extents starting in this bitmap block,
- * looking for one that's long enough.
+ * Loop over all the extents starting in this bitmap block up to the
+ * end of the rt volume, looking for one that's long enough.
*/
- for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0,
- end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1;
- i <= end;
- i++) {
+ end = min(mp->m_sb.sb_rextents, xfs_rbmblock_to_rtx(mp, bbno + 1)) - 1;
+ for (i = xfs_rbmblock_to_rtx(mp, bbno); i <= end; i++) {
/* Make sure we don't scan off the end of the rt volume. */
- maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod);
+ scanlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod);
+ if (scanlen < minlen)
+ break;
/*
- * See if there's a free extent of maxlen starting at i.
+ * See if there's a free extent of scanlen starting at i.
* If it's not so then next will contain the first non-free.
*/
- error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat);
+ error = xfs_rtcheck_range(args, i, scanlen, 1, &next, &stat);
if (error)
return error;
if (stat) {
/*
- * i for maxlen is all free, allocate and return that.
+ * i to scanlen is all free, allocate and return that.
*/
- bestlen = maxlen;
- besti = i;
- goto allocate;
+ *len = scanlen;
+ *rtx = i;
+ return 0;
}
/*
@@ -289,38 +301,28 @@ xfs_rtallocate_extent_block(
return error;
}
- /*
- * Searched the whole thing & didn't find a maxlen free extent.
- */
- if (minlen > maxlen || besti == -1) {
- /*
- * Allocation failed. Set *nextp to the next block to try.
- */
- *nextp = next;
- return -ENOSPC;
- }
+ /* Searched the whole thing & didn't find a maxlen free extent. */
+ if (besti == -1)
+ goto nospace;
/*
- * If size should be a multiple of prod, make that so.
+ * Ensure bestlen is a multiple of prod, but don't return a too-short
+ * extent.
*/
- if (prod > 1) {
- xfs_rtxlen_t p; /* amount to trim length by */
-
- div_u64_rem(bestlen, prod, &p);
- if (p)
- bestlen -= p;
- }
+ bestlen = xfs_rtalloc_align_len(bestlen, prod);
+ if (bestlen < minlen)
+ goto nospace;
/*
- * Allocate besti for bestlen & return that.
+ * Pick besti for bestlen & return that.
*/
-allocate:
- error = xfs_rtallocate_range(args, besti, bestlen);
- if (error)
- return error;
*len = bestlen;
*rtx = besti;
return 0;
+nospace:
+ /* Allocation failed. Set *nextp to the next block to try. */
+ *nextp = next;
+ return -ENOSPC;
}
/*
@@ -339,45 +341,46 @@ xfs_rtallocate_extent_exact(
xfs_rtxlen_t prod, /* extent product factor */
xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- int error;
- xfs_rtxlen_t i; /* extent length trimmed due to prod */
- int isfree; /* extent is free */
+ struct xfs_mount *mp = args->mp;
xfs_rtxnum_t next; /* next rtext to try (dummy) */
+ xfs_rtxlen_t alloclen; /* candidate length */
+ xfs_rtxlen_t scanlen; /* number of free rtx to look for */
+ int isfree; /* extent is free */
+ int error;
ASSERT(minlen % prod == 0);
ASSERT(maxlen % prod == 0);
- /*
- * Check if the range in question (for maxlen) is free.
- */
- error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree);
+
+ /* Make sure we don't run off the end of the rt volume. */
+ scanlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
+ if (scanlen < minlen)
+ return -ENOSPC;
+
+ /* Check if the range in question (for scanlen) is free. */
+ error = xfs_rtcheck_range(args, start, scanlen, 1, &next, &isfree);
if (error)
return error;
- if (!isfree) {
- /*
- * If not, allocate what there is, if it's at least minlen.
- */
- maxlen = next - start;
- if (maxlen < minlen)
- return -ENOSPC;
-
- /*
- * Trim off tail of extent, if prod is specified.
- */
- if (prod > 1 && (i = maxlen % prod)) {
- maxlen -= i;
- if (maxlen < minlen)
- return -ENOSPC;
- }
+ if (isfree) {
+ /* start to scanlen is all free; allocate it. */
+ *len = scanlen;
+ *rtx = start;
+ return 0;
}
/*
- * Allocate what we can and return it.
+ * If not, allocate what there is, if it's at least minlen.
*/
- error = xfs_rtallocate_range(args, start, maxlen);
- if (error)
- return error;
- *len = maxlen;
+ alloclen = next - start;
+ if (alloclen < minlen)
+ return -ENOSPC;
+
+ /* Ensure alloclen is a multiple of prod. */
+ alloclen = xfs_rtalloc_align_len(alloclen, prod);
+ if (alloclen < minlen)
+ return -ENOSPC;
+
+ *len = alloclen;
*rtx = start;
return 0;
}
@@ -416,11 +419,6 @@ xfs_rtallocate_extent_near(
if (start >= mp->m_sb.sb_rextents)
start = mp->m_sb.sb_rextents - 1;
- /* Make sure we don't run off the end of the rt volume. */
- maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
- if (maxlen < minlen)
- return -ENOSPC;
-
/*
* Try the exact allocation first.
*/
@@ -429,7 +427,6 @@ xfs_rtallocate_extent_near(
if (error != -ENOSPC)
return error;
-
bbno = xfs_rtx_to_rbmblock(mp, start);
i = 0;
j = -1;
@@ -552,11 +549,11 @@ xfs_rtalloc_sumlevel(
xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
xfs_fileoff_t i; /* bitmap block number */
+ int error;
for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) {
xfs_suminfo_t sum; /* summary information for extents */
xfs_rtxnum_t n; /* next rtext to be tried */
- int error;
error = xfs_rtget_summary(args, l, i, &sum);
if (error)
@@ -652,141 +649,217 @@ xfs_rtallocate_extent_size(
return -ENOSPC;
}
+static int
+xfs_alloc_rsum_cache(
+ struct xfs_mount *mp,
+ xfs_extlen_t rbmblocks)
+{
+ /*
+ * The rsum cache is initialized to the maximum value, which is
+ * trivially an upper bound on the maximum level with any free extents.
+ */
+ mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
+ if (!mp->m_rsum_cache)
+ return -ENOMEM;
+ memset(mp->m_rsum_cache, -1, rbmblocks);
+ return 0;
+}
+
/*
- * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ * If we changed the rt extent size (meaning there was no rt volume previously)
+ * and the root directory had EXTSZINHERIT and RTINHERIT set, it's possible
+ * that the extent size hint on the root directory is no longer congruent with
+ * the new rt extent size. Log the rootdir inode to fix this.
*/
-STATIC int
-xfs_growfs_rt_alloc(
- struct xfs_mount *mp, /* file system mount point */
- xfs_extlen_t oblocks, /* old count of blocks */
- xfs_extlen_t nblocks, /* new count of blocks */
- struct xfs_inode *ip) /* inode (bitmap/summary) */
+static int
+xfs_growfs_rt_fixup_extsize(
+ struct xfs_mount *mp)
{
- xfs_fileoff_t bno; /* block number in file */
- struct xfs_buf *bp; /* temporary buffer for zeroing */
- xfs_daddr_t d; /* disk block address */
- int error; /* error return value */
- xfs_fsblock_t fsbno; /* filesystem block for bno */
- struct xfs_bmbt_irec map; /* block map output */
- int nmap; /* number of block maps */
- int resblks; /* space reservation */
- enum xfs_blft buf_type;
+ struct xfs_inode *ip = mp->m_rootip;
struct xfs_trans *tp;
+ int error = 0;
- if (ip == mp->m_rsumip)
- buf_type = XFS_BLFT_RTSUMMARY_BUF;
- else
- buf_type = XFS_BLFT_RTBITMAP_BUF;
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (!(ip->i_diflags & XFS_DIFLAG_RTINHERIT) ||
+ !(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT))
+ goto out_iolock;
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange, 0, 0, false,
+ &tp);
+ if (error)
+ goto out_iolock;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+out_iolock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+}
+
+static int
+xfs_growfs_rt_bmblock(
+ struct xfs_mount *mp,
+ xfs_rfsblock_t nrblocks,
+ xfs_agblock_t rextsize,
+ xfs_fileoff_t bmbno)
+{
+ struct xfs_inode *rbmip = mp->m_rbmip;
+ struct xfs_inode *rsumip = mp->m_rsumip;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ };
+ struct xfs_rtalloc_args nargs = {
+ };
+ struct xfs_mount *nmp;
+ xfs_rfsblock_t nrblocks_step;
+ xfs_rtbxlen_t freed_rtx;
+ int error;
+
+ nrblocks_step = (bmbno + 1) * NBBY * mp->m_sb.sb_blocksize * rextsize;
+
+ nmp = nargs.mp = kmemdup(mp, sizeof(*mp), GFP_KERNEL);
+ if (!nmp)
+ return -ENOMEM;
/*
- * Allocate space to the file, as necessary.
+ * Calculate new sb and mount fields for this round.
*/
- while (oblocks < nblocks) {
- resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
- /*
- * Reserve space & log for one extent added to the file.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks,
- 0, 0, &tp);
- if (error)
- return error;
- /*
- * Lock the inode.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ nmp->m_sb.sb_rextsize = rextsize;
+ xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb);
+ nmp->m_sb.sb_rbmblocks = bmbno + 1;
+ nmp->m_sb.sb_rblocks = min(nrblocks, nrblocks_step);
+ nmp->m_sb.sb_rextents = xfs_rtb_to_rtx(nmp, nmp->m_sb.sb_rblocks);
+ nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents);
+ nmp->m_rsumlevels = nmp->m_sb.sb_rextslog + 1;
+ nmp->m_rsumblocks = xfs_rtsummary_blockcount(mp, nmp->m_rsumlevels,
+ nmp->m_sb.sb_rbmblocks);
- error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
- XFS_IEXT_ADD_NOSPLIT_CNT);
- if (error)
- goto out_trans_cancel;
+	/*
+	 * Recompute the growfsrt reservation from the new rsumsize, so that
+	 * the transaction below uses the new, potentially larger value.
+	 */
+ xfs_trans_resv_calc(nmp, &nmp->m_resv);
+ error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0,
+ &args.tp);
+ if (error)
+ goto out_free;
+ nargs.tp = args.tp;
- /*
- * Allocate blocks to the bitmap file.
- */
- nmap = 1;
- error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_METADATA, 0, &map, &nmap);
- if (error)
- goto out_trans_cancel;
- /*
- * Free any blocks freed up in the transaction, then commit.
- */
- error = xfs_trans_commit(tp);
- if (error)
- return error;
- /*
- * Now we need to clear the allocated blocks.
- * Do this one block per transaction, to keep it simple.
- */
- for (bno = map.br_startoff, fsbno = map.br_startblock;
- bno < map.br_startoff + map.br_blockcount;
- bno++, fsbno++) {
- /*
- * Reserve log for one block zeroing.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero,
- 0, 0, 0, &tp);
- if (error)
- return error;
- /*
- * Lock the bitmap inode.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- /*
- * Get a buffer for the block.
- */
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0, &bp);
- if (error)
- goto out_trans_cancel;
+ xfs_rtbitmap_lock(mp);
+ xfs_rtbitmap_trans_join(args.tp);
- xfs_trans_buf_set_type(tp, bp, buf_type);
- bp->b_ops = &xfs_rtbuf_ops;
- memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
- xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
- /*
- * Commit the transaction.
- */
- error = xfs_trans_commit(tp);
- if (error)
- return error;
- }
- /*
- * Go on to the next extent, if any.
- */
- oblocks = map.br_startoff + map.br_blockcount;
+ /*
+ * Update the bitmap inode's size ondisk and incore. We need to update
+ * the incore size so that inode inactivation won't punch what it thinks
+ * are "posteof" blocks.
+ */
+ rbmip->i_disk_size = nmp->m_sb.sb_rbmblocks * nmp->m_sb.sb_blocksize;
+ i_size_write(VFS_I(rbmip), rbmip->i_disk_size);
+ xfs_trans_log_inode(args.tp, rbmip, XFS_ILOG_CORE);
+
+ /*
+ * Update the summary inode's size. We need to update the incore size
+ * so that inode inactivation won't punch what it thinks are "posteof"
+ * blocks.
+ */
+ rsumip->i_disk_size = nmp->m_rsumblocks * nmp->m_sb.sb_blocksize;
+ i_size_write(VFS_I(rsumip), rsumip->i_disk_size);
+ xfs_trans_log_inode(args.tp, rsumip, XFS_ILOG_CORE);
+
+ /*
+ * Copy summary data from old to new sizes when the real size (not
+ * block-aligned) changes.
+ */
+ if (mp->m_sb.sb_rbmblocks != nmp->m_sb.sb_rbmblocks ||
+ mp->m_rsumlevels != nmp->m_rsumlevels) {
+ error = xfs_rtcopy_summary(&args, &nargs);
+ if (error)
+ goto out_cancel;
}
- return 0;
+ /*
+ * Update superblock fields.
+ */
+ if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
+ xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
+ nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
+ if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
+ xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
+ nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
+ if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
+ xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
+ nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
+ if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
+ xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
+ nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
+ if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
+ xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
+ nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
-out_trans_cancel:
- xfs_trans_cancel(tp);
- return error;
-}
+ /*
+ * Free the new extent.
+ */
+ freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
+ error = xfs_rtfree_range(&nargs, mp->m_sb.sb_rextents, freed_rtx);
+ xfs_rtbuf_cache_relse(&nargs);
+ if (error)
+ goto out_cancel;
-static void
-xfs_alloc_rsum_cache(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
-{
/*
- * The rsum cache is initialized to the maximum value, which is
- * trivially an upper bound on the maximum level with any free extents.
- * We can continue without the cache if it couldn't be allocated.
+ * Mark more blocks free in the superblock.
*/
- mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
- if (mp->m_rsum_cache)
- memset(mp->m_rsum_cache, -1, rbmblocks);
- else
- xfs_warn(mp, "could not allocate realtime summary cache");
+ xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
+
+ /*
+ * Update the calculated values in the real mount structure.
+ */
+ mp->m_rsumlevels = nmp->m_rsumlevels;
+ mp->m_rsumblocks = nmp->m_rsumblocks;
+ xfs_mount_sb_set_rextsize(mp, &mp->m_sb);
+
+ /*
+ * Recompute the growfsrt reservation from the new rsumsize.
+ */
+ xfs_trans_resv_calc(mp, &mp->m_resv);
+
+ error = xfs_trans_commit(args.tp);
+ if (error)
+ goto out_free;
+
+ /*
+ * Ensure the mount RT feature flag is now set.
+ */
+ mp->m_features |= XFS_FEAT_REALTIME;
+
+ kfree(nmp);
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(args.tp);
+out_free:
+ kfree(nmp);
+ return error;
}
/*
- * Visible (exported) functions.
+ * Calculate the last rbmblock currently used.
+ *
+ * This also deals with the case where there were no rtextents before.
*/
+static xfs_fileoff_t
+xfs_last_rt_bmblock(
+ struct xfs_mount *mp)
+{
+ xfs_fileoff_t bmbno = mp->m_sb.sb_rbmblocks;
+
+ /* Skip the current block if it is exactly full. */
+ if (xfs_rtx_to_rbmword(mp, mp->m_sb.sb_rextents) != 0)
+ bmbno--;
+ return bmbno;
+}
/*
* Grow the realtime area of the filesystem.
@@ -799,21 +872,13 @@ xfs_growfs_rt(
xfs_fileoff_t bmbno; /* bitmap block number */
struct xfs_buf *bp; /* temporary buffer */
int error; /* error return value */
- xfs_mount_t *nmp; /* new (fake) mount structure */
- xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
xfs_rtxnum_t nrextents; /* new number of realtime extents */
- uint8_t nrextslog; /* new log2 of sb_rextents */
xfs_extlen_t nrsumblocks; /* new number of summary blocks */
- uint nrsumlevels; /* new rt summary levels */
- uint nrsumsize; /* new size of rt summary, bytes */
- xfs_sb_t *nsbp; /* new superblock */
xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
- xfs_sb_t *sbp; /* old superblock */
uint8_t *rsum_cache; /* old summary cache */
-
- sbp = &mp->m_sb;
+ xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -821,63 +886,69 @@ xfs_growfs_rt(
/* Needs to have been mounted with an rt device. */
if (!XFS_IS_REALTIME_MOUNT(mp))
return -EINVAL;
+
+ if (!mutex_trylock(&mp->m_growlock))
+ return -EWOULDBLOCK;
/*
* Mount should fail if the rt bitmap/summary files don't load, but
* we'll check anyway.
*/
+ error = -EINVAL;
if (!mp->m_rbmip || !mp->m_rsumip)
- return -EINVAL;
+ goto out_unlock;
/* Shrink not supported. */
- if (in->newblocks <= sbp->sb_rblocks)
- return -EINVAL;
-
+ if (in->newblocks <= mp->m_sb.sb_rblocks)
+ goto out_unlock;
/* Can only change rt extent size when adding rt volume. */
- if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize)
- return -EINVAL;
+ if (mp->m_sb.sb_rblocks > 0 && in->extsize != mp->m_sb.sb_rextsize)
+ goto out_unlock;
/* Range check the extent size. */
if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE ||
XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE)
- return -EINVAL;
+ goto out_unlock;
/* Unsupported realtime features. */
+ error = -EOPNOTSUPP;
if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
- return -EOPNOTSUPP;
+ goto out_unlock;
- nrblocks = in->newblocks;
- error = xfs_sb_validate_fsb_count(sbp, nrblocks);
+ error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
- return error;
+ goto out_unlock;
/*
* Read in the last block of the device, make sure it exists.
*/
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
- XFS_FSB_TO_BB(mp, nrblocks - 1),
+ XFS_FSB_TO_BB(mp, in->newblocks - 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error)
- return error;
+ goto out_unlock;
xfs_buf_relse(bp);
/*
* Calculate new parameters. These are the final values to be reached.
*/
- nrextents = nrblocks;
- do_div(nrextents, in->extsize);
- if (!xfs_validate_rtextents(nrextents))
- return -EINVAL;
+ nrextents = div_u64(in->newblocks, in->extsize);
+ if (nrextents == 0) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
- nrextslog = xfs_compute_rextslog(nrextents);
- nrsumlevels = nrextslog + 1;
- nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
- nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ nrsumblocks = xfs_rtsummary_blockcount(mp,
+ xfs_compute_rextslog(nrextents) + 1, nrbmblocks);
+
/*
* New summary size can't be more than half the size of
* the log. This prevents us from getting a log overflow,
* since we'll log basically the whole summary file at once.
*/
- if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
- return -EINVAL;
+ if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* Get the old block counts for bitmap and summary inodes.
* These can't change since other growfs callers are locked out.
@@ -887,165 +958,41 @@ xfs_growfs_rt(
/*
* Allocate space to the bitmap and summary files, as necessary.
*/
- error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
+ error = xfs_rtfile_initialize_blocks(mp->m_rbmip, rbmblocks,
+ nrbmblocks, NULL);
if (error)
- return error;
- error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
+ goto out_unlock;
+ error = xfs_rtfile_initialize_blocks(mp->m_rsumip, rsumblocks,
+ nrsumblocks, NULL);
if (error)
- return error;
+ goto out_unlock;
rsum_cache = mp->m_rsum_cache;
- if (nrbmblocks != sbp->sb_rbmblocks)
- xfs_alloc_rsum_cache(mp, nrbmblocks);
-
- /*
- * Allocate a new (fake) mount/sb.
- */
- nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL);
- /*
- * Loop over the bitmap blocks.
- * We will do everything one bitmap block at a time.
- * Skip the current block if it is exactly full.
- * This also deals with the case where there were no rtextents before.
- */
- for (bmbno = sbp->sb_rbmblocks -
- ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
- bmbno < nrbmblocks;
- bmbno++) {
- struct xfs_rtalloc_args args = {
- .mp = mp,
- };
- struct xfs_rtalloc_args nargs = {
- .mp = nmp,
- };
- struct xfs_trans *tp;
- xfs_rfsblock_t nrblocks_step;
-
- *nmp = *mp;
- nsbp = &nmp->m_sb;
- /*
- * Calculate new sb and mount fields for this round.
- */
- nsbp->sb_rextsize = in->extsize;
- nmp->m_rtxblklog = -1; /* don't use shift or masking */
- nsbp->sb_rbmblocks = bmbno + 1;
- nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize *
- nsbp->sb_rextsize;
- nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
- nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
- ASSERT(nsbp->sb_rextents != 0);
- nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents);
- nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
- nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
- nsbp->sb_rbmblocks);
- nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
- /* recompute growfsrt reservation from new rsumsize */
- xfs_trans_resv_calc(nmp, &nmp->m_resv);
-
- /*
- * Start a transaction, get the log reservation.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0,
- &tp);
+ if (nrbmblocks != mp->m_sb.sb_rbmblocks) {
+ error = xfs_alloc_rsum_cache(mp, nrbmblocks);
if (error)
- break;
- args.tp = tp;
- nargs.tp = tp;
-
- /*
- * Lock out other callers by grabbing the bitmap and summary
- * inode locks and joining them to the transaction.
- */
- xfs_rtbitmap_lock(tp, mp);
- /*
- * Update the bitmap inode's size ondisk and incore. We need
- * to update the incore size so that inode inactivation won't
- * punch what it thinks are "posteof" blocks.
- */
- mp->m_rbmip->i_disk_size =
- nsbp->sb_rbmblocks * nsbp->sb_blocksize;
- i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_disk_size);
- xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
- /*
- * Update the summary inode's size. We need to update the
- * incore size so that inode inactivation won't punch what it
- * thinks are "posteof" blocks.
- */
- mp->m_rsumip->i_disk_size = nmp->m_rsumsize;
- i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_disk_size);
- xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
- /*
- * Copy summary data from old to new sizes.
- * Do this when the real size (not block-aligned) changes.
- */
- if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
- mp->m_rsumlevels != nmp->m_rsumlevels) {
- error = xfs_rtcopy_summary(&args, &nargs);
- if (error)
- goto error_cancel;
- }
- /*
- * Update superblock fields.
- */
- if (nsbp->sb_rextsize != sbp->sb_rextsize)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
- nsbp->sb_rextsize - sbp->sb_rextsize);
- if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
- nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
- if (nsbp->sb_rblocks != sbp->sb_rblocks)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
- nsbp->sb_rblocks - sbp->sb_rblocks);
- if (nsbp->sb_rextents != sbp->sb_rextents)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
- nsbp->sb_rextents - sbp->sb_rextents);
- if (nsbp->sb_rextslog != sbp->sb_rextslog)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
- nsbp->sb_rextslog - sbp->sb_rextslog);
- /*
- * Free new extent.
- */
- error = xfs_rtfree_range(&nargs, sbp->sb_rextents,
- nsbp->sb_rextents - sbp->sb_rextents);
- xfs_rtbuf_cache_relse(&nargs);
- if (error) {
-error_cancel:
- xfs_trans_cancel(tp);
- break;
- }
- /*
- * Mark more blocks free in the superblock.
- */
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
- nsbp->sb_rextents - sbp->sb_rextents);
- /*
- * Update mp values into the real mp structure.
- */
- mp->m_rsumlevels = nrsumlevels;
- mp->m_rsumsize = nrsumsize;
- /* recompute growfsrt reservation from new rsumsize */
- xfs_trans_resv_calc(mp, &mp->m_resv);
+ goto out_unlock;
+ }
- error = xfs_trans_commit(tp);
+ /* Initialize the free space bitmap one bitmap block at a time. */
+ for (bmbno = xfs_last_rt_bmblock(mp); bmbno < nrbmblocks; bmbno++) {
+ error = xfs_growfs_rt_bmblock(mp, in->newblocks, in->extsize,
+ bmbno);
if (error)
- break;
+ goto out_free;
+ }
- /* Ensure the mount RT feature flag is now set. */
- mp->m_features |= XFS_FEAT_REALTIME;
+ if (old_rextsize != in->extsize) {
+ error = xfs_growfs_rt_fixup_extsize(mp);
+ if (error)
+ goto out_free;
}
- if (error)
- goto out_free;
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
out_free:
/*
- * Free the fake mp structure.
- */
- kfree(nmp);
-
- /*
* If we had to allocate a new rsum_cache, we either need to free the
* old one (if we succeeded) or free the new one and restore the old one
* (if there was an error).
@@ -1059,6 +1006,8 @@ out_free:
}
}
+out_unlock:
+ mutex_unlock(&mp->m_growlock);
return error;
}
@@ -1072,7 +1021,6 @@ xfs_rtmount_init(
struct xfs_buf *bp; /* buffer for last block of subvolume */
struct xfs_sb *sbp; /* filesystem superblock copy in mount */
xfs_daddr_t d; /* address of last block of subvolume */
- unsigned int rsumblocks;
int error;
sbp = &mp->m_sb;
@@ -1084,9 +1032,8 @@ xfs_rtmount_init(
return -ENODEV;
}
mp->m_rsumlevels = sbp->sb_rextslog + 1;
- rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels,
+ mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels,
mp->m_sb.sb_rbmblocks);
- mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
mp->m_rbmip = mp->m_rsumip = NULL;
/*
* Check that the realtime section is an ok size.
@@ -1216,7 +1163,9 @@ xfs_rtmount_inodes(
if (error)
goto out_rele_summary;
- xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
+ error = xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
+ if (error)
+ goto out_rele_summary;
return 0;
out_rele_summary:
@@ -1244,12 +1193,11 @@ xfs_rtunmount_inodes(
* of rtextents and the fraction.
* The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
*/
-static int
+static xfs_rtxnum_t
xfs_rtpick_extent(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
- xfs_rtxlen_t len, /* allocation length (rtextents) */
- xfs_rtxnum_t *pick) /* result rt extent */
+ xfs_rtxlen_t len) /* allocation length (rtextents) */
{
xfs_rtxnum_t b; /* result rtext */
int log2; /* log of sequence number */
@@ -1280,8 +1228,7 @@ xfs_rtpick_extent(
ts.tv_sec = seq + 1;
inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
- *pick = b;
- return 0;
+ return b;
}
static void
@@ -1313,36 +1260,109 @@ xfs_rtalloc_align_minmax(
*raminlen = newminlen;
}
-int
-xfs_bmap_rtalloc(
- struct xfs_bmalloca *ap)
+static int
+xfs_rtallocate(
+ struct xfs_trans *tp,
+ xfs_rtblock_t bno_hint,
+ xfs_rtxlen_t minlen,
+ xfs_rtxlen_t maxlen,
+ xfs_rtxlen_t prod,
+ bool wasdel,
+ bool initial_user_data,
+ bool *rtlocked,
+ xfs_rtblock_t *bno,
+ xfs_extlen_t *blen)
+{
+ struct xfs_rtalloc_args args = {
+ .mp = tp->t_mountp,
+ .tp = tp,
+ };
+ xfs_rtxnum_t start = 0;
+ xfs_rtxnum_t rtx;
+ xfs_rtxlen_t len = 0;
+ int error = 0;
+
+ /*
+ * Lock out modifications to both the RT bitmap and summary inodes.
+ */
+ if (!*rtlocked) {
+ xfs_rtbitmap_lock(args.mp);
+ xfs_rtbitmap_trans_join(tp);
+ *rtlocked = true;
+ }
+
+ /*
+ * For an allocation to an empty file at offset 0, pick an extent that
+ * will space things out in the rt area.
+ */
+ if (bno_hint)
+ start = xfs_rtb_to_rtx(args.mp, bno_hint);
+ else if (initial_user_data)
+ start = xfs_rtpick_extent(args.mp, tp, maxlen);
+
+ if (start) {
+ error = xfs_rtallocate_extent_near(&args, start, minlen, maxlen,
+ &len, prod, &rtx);
+ /*
+ * If we can't allocate near a specific rt extent, try again
+ * without locality criteria.
+ */
+ if (error == -ENOSPC) {
+ xfs_rtbuf_cache_relse(&args);
+ error = 0;
+ }
+ }
+
+ if (!error) {
+ error = xfs_rtallocate_extent_size(&args, minlen, maxlen, &len,
+ prod, &rtx);
+ }
+
+ if (error)
+ goto out_release;
+
+ error = xfs_rtallocate_range(&args, rtx, len);
+ if (error)
+ goto out_release;
+
+ xfs_trans_mod_sb(tp, wasdel ?
+ XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS,
+ -(long)len);
+ *bno = xfs_rtx_to_rtb(args.mp, rtx);
+ *blen = xfs_rtxlen_to_extlen(args.mp, len);
+
+out_release:
+ xfs_rtbuf_cache_relse(&args);
+ return error;
+}
+
+static int
+xfs_rtallocate_align(
+ struct xfs_bmalloca *ap,
+ xfs_rtxlen_t *ralen,
+ xfs_rtxlen_t *raminlen,
+ xfs_rtxlen_t *prod,
+ bool *noalign)
{
struct xfs_mount *mp = ap->ip->i_mount;
xfs_fileoff_t orig_offset = ap->offset;
- xfs_rtxnum_t start; /* allocation hint rtextent no */
- xfs_rtxnum_t rtx; /* actually allocated rtextent no */
- xfs_rtxlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_rtxlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_extlen_t orig_length = ap->length;
xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
- xfs_rtxlen_t raminlen;
- bool rtlocked = false;
- bool ignore_locality = false;
- struct xfs_rtalloc_args args = {
- .mp = mp,
- .tp = ap->tp,
- };
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t mod; /* product factor for allocators */
int error;
- align = xfs_get_extsz_hint(ap->ip);
- if (!align)
- align = 1;
-retry:
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 1, ap->eof, 0,
- ap->conv, &ap->offset, &ap->length);
+ if (*noalign) {
+ align = mp->m_sb.sb_rextsize;
+ } else {
+ align = xfs_get_extsz_hint(ap->ip);
+ if (!align)
+ align = 1;
+ if (align == mp->m_sb.sb_rextsize)
+ *noalign = true;
+ }
+
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1,
+ ap->eof, 0, ap->conv, &ap->offset, &ap->length);
if (error)
return error;
ASSERT(ap->length);
@@ -1366,59 +1386,55 @@ retry:
* XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
- raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
- ASSERT(raminlen > 0);
- ASSERT(raminlen <= ralen);
-
- /*
- * Lock out modifications to both the RT bitmap and summary inodes
- */
- if (!rtlocked) {
- xfs_rtbitmap_lock(ap->tp, mp);
- rtlocked = true;
- }
-
- if (ignore_locality) {
- start = 0;
- } else if (xfs_bmap_adjacent(ap)) {
- start = xfs_rtb_to_rtx(mp, ap->blkno);
- } else if (ap->datatype & XFS_ALLOC_INITIAL_USER_DATA) {
- /*
- * If it's an allocation to an empty file at offset 0, pick an
- * extent that will space things out in the rt area.
- */
- error = xfs_rtpick_extent(mp, ap->tp, ralen, &start);
- if (error)
- return error;
- } else {
- start = 0;
- }
+ *ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
+ *raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
+ ASSERT(*raminlen > 0);
+ ASSERT(*raminlen <= *ralen);
/*
* Only bother calculating a real prod factor if offset & length are
* perfectly aligned, otherwise it will just get us in trouble.
*/
div_u64_rem(ap->offset, align, &mod);
- if (mod || ap->length % align) {
- prod = 1;
- } else {
- prod = xfs_extlen_to_rtxlen(mp, align);
- if (prod > 1)
- xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod);
- }
+ if (mod || ap->length % align)
+ *prod = 1;
+ else
+ *prod = xfs_extlen_to_rtxlen(mp, align);
- if (start) {
- error = xfs_rtallocate_extent_near(&args, start, raminlen,
- ralen, &ralen, prod, &rtx);
- } else {
- error = xfs_rtallocate_extent_size(&args, raminlen,
- ralen, &ralen, prod, &rtx);
- }
- xfs_rtbuf_cache_relse(&args);
+ if (*prod > 1)
+ xfs_rtalloc_align_minmax(raminlen, ralen, prod);
+ return 0;
+}
+int
+xfs_bmap_rtalloc(
+ struct xfs_bmalloca *ap)
+{
+ xfs_fileoff_t orig_offset = ap->offset;
+ xfs_rtxlen_t prod = 0; /* product factor for allocators */
+ xfs_rtxlen_t ralen = 0; /* realtime allocation length */
+ xfs_rtblock_t bno_hint = NULLRTBLOCK;
+ xfs_extlen_t orig_length = ap->length;
+ xfs_rtxlen_t raminlen;
+ bool rtlocked = false;
+ bool noalign = false;
+ bool initial_user_data =
+ ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
+ int error;
+
+retry:
+ error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
+ if (error)
+ return error;
+
+ if (xfs_bmap_adjacent(ap))
+ bno_hint = ap->blkno;
+
+ error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod,
+ ap->wasdel, initial_user_data, &rtlocked,
+ &ap->blkno, &ap->length);
if (error == -ENOSPC) {
- if (align > mp->m_sb.sb_rextsize) {
+ if (!noalign) {
/*
* We previously enlarged the request length to try to
* satisfy an extent size hint. The allocator didn't
@@ -1428,16 +1444,7 @@ retry:
*/
ap->offset = orig_offset;
ap->length = orig_length;
- minlen = align = mp->m_sb.sb_rextsize;
- goto retry;
- }
-
- if (!ignore_locality && start != 0) {
- /*
- * If we can't allocate near a specific rt extent, try
- * again without locality criteria.
- */
- ignore_locality = true;
+ noalign = true;
goto retry;
}
@@ -1448,11 +1455,6 @@ retry:
if (error)
return error;
- xfs_trans_mod_sb(ap->tp, ap->wasdel ?
- XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS,
- -(long)ralen);
- ap->blkno = xfs_rtx_to_rtb(mp, rtx);
- ap->length = xfs_rtxlen_to_extlen(mp, ralen);
xfs_bmap_alloc_account(ap);
return 0;
}
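
A recurring pattern in the rtalloc changes above is: clamp the candidate length so it does not run off the end of the rt volume, align it down to the product factor with xfs_rtalloc_align_len(), and fail with -ENOSPC once the result drops below minlen. A standalone userspace sketch of that clamp-then-align check (hypothetical names, not kernel code), with one worked input:

	#include <stdint.h>
	#include <stdio.h>

	/* rounddown to a multiple of prod, mirroring xfs_rtalloc_align_len() */
	static uint64_t align_len(uint64_t len, uint64_t prod)
	{
		return prod > 1 ? len - (len % prod) : len;
	}

	/* clamp to the volume end, align, and reject too-short results */
	static int clamp_and_align(uint64_t rextents, uint64_t start,
				   uint64_t maxlen, uint64_t minlen,
				   uint64_t prod, uint64_t *out)
	{
		uint64_t len = maxlen;

		if (start + len > rextents)
			len = rextents - start;
		len = align_len(len, prod);
		if (len < minlen)
			return -1;	/* -ENOSPC in the kernel */
		*out = len;
		return 0;
	}

	int main(void)
	{
		uint64_t got;

		/* 100-rtx volume, want 16 rtx at rtx 90, prod 4, minlen 8 */
		if (clamp_and_align(100, 90, 16, 8, 4, &got) == 0)
			printf("allocate %llu rtx\n", (unsigned long long)got);
		else
			printf("ENOSPC\n");
		return 0;
	}

With those inputs the clamp trims 16 down to 10 and the alignment trims 10 down to 8, which still satisfies minlen, so 8 rtextents are allocated; a minlen of 12 would instead hit the -ENOSPC path.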
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 27e9f749c4c7..26767745d9fd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -311,9 +311,9 @@ xfs_set_inode_alloc(
* the allocator to accommodate the request.
*/
if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32)
- set_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
+ xfs_set_inode32(mp);
else
- clear_bit(XFS_OPSTATE_INODE32, &mp->m_opstate);
+ xfs_clear_inode32(mp);
for (index = 0; index < agcount; index++) {
struct xfs_perag *pag;
@@ -1511,7 +1511,7 @@ xfs_fs_fill_super(
* the newer fsopen/fsconfig API.
*/
if (fc->sb_flags & SB_RDONLY)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ xfs_set_readonly(mp);
if (fc->sb_flags & SB_DIRSYNC)
mp->m_features |= XFS_FEAT_DIRSYNC;
if (fc->sb_flags & SB_SYNCHRONOUS)
@@ -1820,7 +1820,7 @@ xfs_remount_rw(
return -EINVAL;
}
- clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ xfs_clear_readonly(mp);
/*
* If this is the first remount to writeable state we might have some
@@ -1908,7 +1908,7 @@ xfs_remount_ro(
xfs_save_resvblks(mp);
xfs_log_clean(mp);
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ xfs_set_readonly(mp);
return 0;
}
@@ -2009,8 +2009,7 @@ static int xfs_init_fs_context(
return -ENOMEM;
spin_lock_init(&mp->m_sb_lock);
- INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
- spin_lock_init(&mp->m_perag_lock);
+ xa_init(&mp->m_perags);
mutex_init(&mp->m_growlock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
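
The xfs_init_fs_context() hunk above swaps the per-AG radix tree for an XArray. Under the generic XArray API the insert and lookup paths would take this shape (a sketch assuming fs/xfs/xfs_mount.h now declares struct xarray m_perags; the perag helpers themselves are outside this diff):

	#include <linux/xarray.h>

	/* insert a freshly built perag at its AG number (sketch) */
	static int example_perag_insert(struct xfs_mount *mp,
					xfs_agnumber_t agno,
					struct xfs_perag *pag)
	{
		/* xa_insert() fails with -EBUSY if the slot is already used */
		return xa_insert(&mp->m_perags, agno, pag, GFP_KERNEL);
	}

	/* look up a perag by AG number (sketch) */
	static struct xfs_perag *example_perag_lookup(struct xfs_mount *mp,
						      xfs_agnumber_t agno)
	{
		return xa_load(&mp->m_perags, agno);
	}

Unlike the radix tree, the XArray embeds its own spinlock, which is why the separate m_perag_lock initialization disappears in the same hunk.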
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 77f19e2f66e0..4252b07cd251 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -165,7 +165,7 @@ xfs_symlink(
/*
* Allocate an inode for the symlink.
*/
- error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
+ error = xfs_dialloc(&tp, &args, &ino);
if (!error)
error = xfs_icreate(tp, ino, &args, &du.ip);
if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 180ce697305a..ee9f0b1f548d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -210,14 +210,14 @@ DEFINE_EVENT(xfs_perag_class, name, \
TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \
TP_ARGS(pag, caller_ip))
DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_hold);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
DEFINE_PERAG_REF_EVENT(xfs_perag_grab);
-DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_grab_next_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_rele);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+DEFINE_PERAG_REF_EVENT(xfs_reclaim_inodes_count);
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
@@ -4926,7 +4926,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
{ XFS_EXCHANGE_RANGE_DRY_RUN, "DRY_RUN" }, \
{ XFS_EXCHANGE_RANGE_FILE1_WRITTEN, "F1_WRITTEN" }, \
{ __XFS_EXCHANGE_RANGE_UPD_CMTIME1, "CMTIME1" }, \
- { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }
+ { __XFS_EXCHANGE_RANGE_UPD_CMTIME2, "CMTIME2" }, \
+ { __XFS_EXCHANGE_RANGE_CHECK_FRESH2, "FRESH2" }
/* file exchange-range tracepoint class */
DECLARE_EVENT_CLASS(xfs_exchrange_class,
@@ -4986,6 +4987,60 @@ DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
+TRACE_EVENT(xfs_exchrange_freshness,
+ TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2),
+ TP_ARGS(fxr, ip2),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ip2_ino)
+ __field(long long, ip2_mtime)
+ __field(long long, ip2_ctime)
+ __field(int, ip2_mtime_nsec)
+ __field(int, ip2_ctime_nsec)
+
+ __field(xfs_ino_t, file2_ino)
+ __field(long long, file2_mtime)
+ __field(long long, file2_ctime)
+ __field(int, file2_mtime_nsec)
+ __field(int, file2_ctime_nsec)
+ ),
+ TP_fast_assign(
+ struct timespec64 ts64;
+ struct inode *inode2 = VFS_I(ip2);
+
+ __entry->dev = inode2->i_sb->s_dev;
+ __entry->ip2_ino = ip2->i_ino;
+
+ ts64 = inode_get_ctime(inode2);
+ __entry->ip2_ctime = ts64.tv_sec;
+ __entry->ip2_ctime_nsec = ts64.tv_nsec;
+
+ ts64 = inode_get_mtime(inode2);
+ __entry->ip2_mtime = ts64.tv_sec;
+ __entry->ip2_mtime_nsec = ts64.tv_nsec;
+
+ __entry->file2_ino = fxr->file2_ino;
+ __entry->file2_mtime = fxr->file2_mtime.tv_sec;
+ __entry->file2_ctime = fxr->file2_ctime.tv_sec;
+ __entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec;
+ __entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec;
+ ),
+ TP_printk("dev %d:%d "
+ "ino 0x%llx mtime %lld:%d ctime %lld:%d -> "
+ "file 0x%llx mtime %lld:%d ctime %lld:%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ip2_ino,
+ __entry->ip2_mtime,
+ __entry->ip2_mtime_nsec,
+ __entry->ip2_ctime,
+ __entry->ip2_ctime_nsec,
+ __entry->file2_ino,
+ __entry->file2_mtime,
+ __entry->file2_mtime_nsec,
+ __entry->file2_ctime,
+ __entry->file2_ctime_nsec)
+);
+
TRACE_EVENT(xfs_exchmaps_overhead,
TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
unsigned long long rmapbt_blocks),
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0fafcc9f3dbe..8ede9d099d1f 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -644,7 +644,12 @@ xfsaild(
set_freezable();
while (1) {
- if (tout)
+ /*
+ * Long waits of 50ms or more occur when we've run out of items
+ * to push, so we only want uninterruptible state if we're
+ * actually blocked on something.
+ */
+ if (tout && tout <= 20)
set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
else
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);