diff options
Diffstat (limited to 'Documentation/filesystems')
| -rw-r--r-- | Documentation/filesystems/Locking | 45 | ||||
| -rw-r--r-- | Documentation/filesystems/btrfs.txt | 180 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4.txt | 21 | ||||
| -rw-r--r-- | Documentation/filesystems/f2fs.txt | 13 | ||||
| -rw-r--r-- | Documentation/filesystems/jfs.txt | 2 | ||||
| -rw-r--r-- | Documentation/filesystems/nfs/00-INDEX | 2 | ||||
| -rw-r--r-- | Documentation/filesystems/nfs/rpc-server-gss.txt | 91 | ||||
| -rw-r--r-- | Documentation/filesystems/porting | 10 | ||||
| -rw-r--r-- | Documentation/filesystems/proc.txt | 7 | ||||
| -rw-r--r-- | Documentation/filesystems/qnx6.txt | 2 | ||||
| -rw-r--r-- | Documentation/filesystems/vfat.txt | 28 | ||||
| -rw-r--r-- | Documentation/filesystems/vfs.txt | 97 | ||||
| -rw-r--r-- | Documentation/filesystems/xfs-self-describing-metadata.txt | 350 | ||||
| -rw-r--r-- | Documentation/filesystems/xfs.txt | 3 |
14 files changed, 784 insertions, 67 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f48e0c6b4c42..fe7afe225381 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -10,10 +10,9 @@ be able to use diff(1). --------------------------- dentry_operations -------------------------- prototypes: int (*d_revalidate)(struct dentry *, unsigned int); - int (*d_hash)(const struct dentry *, const struct inode *, - struct qstr *); - int (*d_compare)(const struct dentry *, const struct inode *, - const struct dentry *, const struct inode *, + int (*d_weak_revalidate)(struct dentry *, unsigned int); + int (*d_hash)(const struct dentry *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); @@ -25,6 +24,7 @@ prototypes: locking rules: rename_lock ->d_lock may block rcu-walk d_revalidate: no no yes (ref-walk) maybe +d_weak_revalidate:no no yes no d_hash no no no maybe d_compare: yes no no maybe d_delete: no yes no no @@ -64,6 +64,7 @@ prototypes: int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); + int (*tmpfile) (struct inode *, struct dentry *, umode_t); locking rules: all may block @@ -91,6 +92,7 @@ removexattr: yes fiemap: no update_time: no atomic_open: yes +tmpfile: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. @@ -187,7 +189,7 @@ prototypes: loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); - int (*invalidatepage) (struct page *, unsigned long); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, @@ -308,8 +310,8 @@ filesystems and by the swapper. 
The latter will eventually go away. Please, keep it that way and don't breed new callers. ->invalidatepage() is called when the filesystem must attempt to drop -some or all of the buffers from the page when it is being truncated. It -returns zero on success. If ->invalidatepage is zero, the kernel uses +some or all of the buffers from the page when it is being truncated. It +returns zero on success. If ->invalidatepage is zero, the kernel uses block_invalidatepage() instead. ->releasepage() is called when the kernel is about to try to drop the @@ -342,25 +344,38 @@ prototypes: locking rules: - file_lock_lock may block + inode->i_lock may block fl_copy_lock: yes no fl_release_private: maybe no ----------------------- lock_manager_operations --------------------------- prototypes: int (*lm_compare_owner)(struct file_lock *, struct file_lock *); + unsigned long (*lm_owner_key)(struct file_lock *); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, struct file_lock *, int); void (*lm_break)(struct file_lock *); /* break_lease callback */ int (*lm_change)(struct file_lock **, int); locking rules: - file_lock_lock may block -lm_compare_owner: yes no -lm_notify: yes no -lm_grant: no no -lm_break: yes no -lm_change yes no + + inode->i_lock blocked_lock_lock may block +lm_compare_owner: yes[1] maybe no +lm_owner_key yes[1] yes no +lm_notify: yes yes no +lm_grant: no no no +lm_break: yes no no +lm_change yes no no + +[1]: ->lm_compare_owner and ->lm_owner_key are generally called with +*an* inode->i_lock held. It may not be the i_lock of the inode +associated with either file_lock argument! This is the case with deadlock +detection, since the code has to chase down the owners of locks that may +be entirely unrelated to the one on which the lock is being acquired. +For deadlock detection however, the blocked_lock_lock is also held. 
The +fact that these locks are held ensures that the file_locks do not +disappear out from under you while doing the comparison or generating an +owner key. --------------------------- buffer_head ----------------------------------- prototypes: @@ -412,7 +427,7 @@ prototypes: ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - int (*readdir) (struct file *, void *, filldir_t); + int (*iterate) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 7671352216f1..b349d57b76ea 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -1,8 +1,8 @@ - BTRFS - ===== +BTRFS +===== -Btrfs is a new copy on write filesystem for Linux aimed at +Btrfs is a copy on write filesystem for Linux aimed at implementing advanced features while focusing on fault tolerance, repair and easy administration. Initially developed by Oracle, Btrfs is licensed under the GPL and open for contribution from anyone. @@ -34,9 +34,175 @@ The main Btrfs features include: * Online filesystem defragmentation +Mount Options +============= - MAILING LIST - ============ +When mounting a btrfs filesystem, the following options are accepted. +Unless otherwise specified, all options default to off. + + alloc_start=<bytes> + Debugging option to force all block allocations above a certain + byte threshold on each block device. The value is specified in + bytes, optionally with a K, M, or G suffix, case insensitive. + Default is 1MB.
+ + autodefrag + Detect small random writes into files and queue them up for the + defrag process. Works best for small files; not well suited for + large database workloads. + + check_int + check_int_data + check_int_print_mask=<value> + These debugging options control the behavior of the integrity checking + module (the BTRFS_FS_CHECK_INTEGRITY config option required). + + check_int enables the integrity checker module, which examines all + block write requests to ensure on-disk consistency, at a large + memory and CPU cost. + + check_int_data includes extent data in the integrity checks, and + implies the check_int option. + + check_int_print_mask takes a bitmask of BTRFSIC_PRINT_MASK_* values + as defined in fs/btrfs/check-integrity.c, to control the integrity + checker module behavior. + + See comments at the top of fs/btrfs/check-integrity.c for more info. + + compress + compress=<type> + compress-force + compress-force=<type> + Control BTRFS file data compression. Type may be specified as "zlib", + "lzo" or "no" (for no compression, used for remounting). If no type + is specified, zlib is used. If compress-force is specified, + all files will be compressed, whether or not they compress well. + If compression is enabled, nodatacow and nodatasum are disabled. + + degraded + Allow mounts to continue with missing devices. A read-write mount may + fail with too many devices missing, for example if a stripe member + is completely missing. + + device=<devicepath> + Specify a device during mount so that ioctls on the control device + can be avoided. Especially useful when trying to mount a multi-device + setup as root. May be specified multiple times for multiple devices. + + discard + Issue frequent commands to let the block device reclaim space freed by + the filesystem. This is useful for SSD devices, thinly provisioned + LUNs and virtual machine images, but may have a significant + performance impact.
(The fstrim command is also available to + initiate batch trims from userspace). + + enospc_debug + Debugging option to be more verbose in some ENOSPC conditions. + + fatal_errors=<action> + Action to take when encountering a fatal error: + "bug" - BUG() on a fatal error. This is the default. + "panic" - panic() on a fatal error. + + flushoncommit + The 'flushoncommit' mount option forces any data dirtied by a write in a + prior transaction to commit as part of the current commit. This makes + the committed state a fully consistent view of the file system from the + application's perspective (i.e., it includes all completed file system + operations). This was previously the behavior only when a snapshot is + created. + + inode_cache + Enable free inode number caching. Defaults to off due to an overflow + problem when the free space crcs don't fit inside a single page. + + max_inline=<bytes> + Specify the maximum amount of space, in bytes, that can be inlined in + a metadata B-tree leaf. The value is specified in bytes, optionally + with a K, M, or G suffix, case insensitive. In practice, this value + is limited by the root sector size, with some space unavailable due + to leaf headers. For a 4k sectorsize, max inline data is ~3900 bytes. + + metadata_ratio=<value> + Specify that 1 metadata chunk should be allocated after every <value> + data chunks. Off by default. + + noacl + Disable support for Posix Access Control Lists (ACLs). See the + acl(5) manual page for more information about ACLs. + + nobarrier + Disables the use of block layer write barriers. Write barriers ensure + that certain IOs make it through the device cache and are on persistent + storage. If used on a device with a volatile (non-battery-backed) + write-back cache, this option will lead to filesystem corruption on a + system crash or power loss. + + nodatacow + Disable data copy-on-write for newly created files. Implies nodatasum, + and disables all compression. 
+ + nodatasum + Disable data checksumming for newly created files. + + notreelog + Disable the tree logging used for fsync and O_SYNC writes. + + recovery + Enable autorecovery attempts if a bad tree root is found at mount time. + Currently this scans a list of several previous tree roots and tries to + use the first readable. + + skip_balance + Skip automatic resume of interrupted balance operation after mount. + May be resumed with "btrfs balance resume." + + space_cache (*) + Enable the on-disk freespace cache. + nospace_cache + Disable freespace cache loading without clearing the cache. + clear_cache + Force clearing and rebuilding of the disk space cache if something + has gone wrong. + + ssd + nossd + ssd_spread + Options to control ssd allocation schemes. By default, BTRFS will + enable or disable ssd allocation heuristics depending on whether a + rotational or nonrotational disk is in use. The ssd and nossd options + can override this autodetection. + + The ssd_spread mount option attempts to allocate into big chunks + of unused space, and may perform better on low-end ssds. ssd_spread + implies ssd, enabling all other ssd heuristics as well. + + subvol=<path> + Mount subvolume at <path> rather than the root subvolume. <path> is + relative to the top level subvolume. + + subvolid=<ID> + Mount subvolume specified by an ID number rather than the root subvolume. + This allows mounting of subvolumes which are not in the root of the mounted + filesystem. + You can use "btrfs subvolume list" to see subvolume ID numbers. + + subvolrootid=<objectid> (deprecated) + Mount subvolume specified by <objectid> rather than the root subvolume. + This allows mounting of subvolumes which are not in the root of the mounted + filesystem. + You can use "btrfs subvolume show " to see the object ID for a subvolume. + + thread_pool=<number> + The number of worker threads to allocate. The default number is equal + to the number of CPUs + 2, or 8, whichever is smaller. 
+ + user_subvol_rm_allowed + Allow subvolumes to be deleted by a non-root user. Use with caution. + +MAILING LIST +============ There is a Btrfs mailing list hosted on vger.kernel.org. You can find details on how to subscribe here: @@ -49,8 +215,8 @@ http://dir.gmane.org/gmane.comp.file-systems.btrfs - IRC - === +IRC +=== Discussion of Btrfs also occurs on the #btrfs channel of the Freenode IRC network. diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 34ea4f1fa6ea..f7cbf574a875 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname> session_write_kbytes This file is read-only and shows the number of kilobytes of data that have been written to this filesystem since it was mounted. + + reserved_clusters This is a RW file and contains the number of reserved + clusters in the file system which will be used + in the specific situations to avoid costly + zeroout, unexpected ENOSPC, or possible data + loss. The default is 2% or 4096 clusters, + whichever is smaller and this can be changed + however it can never exceed the number of clusters + in the file system. If there is not enough space + for the reserved space when mounting the file + system, the mount will _not_ fail. .............................................................................. Ioctls @@ -587,6 +598,16 @@ Table of Ext4 specific ioctls bitmaps and inode table, the userspace tool thus just passes the new number of blocks. +EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes + (like i_blocks, i_size, i_flags, ...) from + the specified inode with inode + EXT4_BOOT_LOADER_INO (#5). This is typically + used to store a boot loader in a secure part of + the filesystem, where it can't be changed by a + normal user by accident. + The data blocks of the previous boot loader + will be associated with the given inode. + ..............................................................................
References diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index dcf338e62b71..b91e2f26b672 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -98,8 +98,13 @@ Cleaning Overhead MOUNT OPTIONS ================================================================================ -background_gc_off Turn off cleaning operations, namely garbage collection, - triggered in background when I/O subsystem is idle. +background_gc=%s Turn on/off cleaning operations, namely garbage + collection, triggered in background when I/O subsystem is + idle. If background_gc=on, it will turn on the garbage + collection and if background_gc=off, garbage collection + will be turned off. + Default value for this option is on. So garbage + collection is on by default. disable_roll_forward Disable the roll-forward recovery routine discard Issue discard/TRIM commands when a segment is cleaned. no_heap Disable heap-style segment allocation which finds free @@ -146,7 +151,7 @@ USAGE Format options -------------- --l [label] : Give a volume label, up to 256 unicode name. +-l [label] : Give a volume label, up to 512 unicode name. -a [0 or 1] : Split start location of each area for heap-based allocation. 1 is set by default, which performs this. -o [int] : Set overprovision ratio in percent over volume size. @@ -156,6 +161,8 @@ Format options -z [int] : Set the number of sections per zone. 1 is set by default. -e [str] : Set basic extension list. e.g. "mp3,gif,mov" +-t [0 or 1] : Disable discard command or not. + 1 is set by default, which conducts discard. ================================================================================ DESIGN diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt index f7433355394a..41fd757997b3 100644 --- a/Documentation/filesystems/jfs.txt +++ b/Documentation/filesystems/jfs.txt @@ -42,7 +42,7 @@ nodiscard(*) block device when blocks are freed.
This is useful for SSD devices and sparse/thinly-provisioned LUNs. The FITRIM ioctl command is also available together with the nodiscard option. The value of minlen specifies the minimum blockcount, when - a TRIM command to the block device is considered usefull. + a TRIM command to the block device is considered useful. When no value is given to the discard option, it defaults to 64 blocks, which means 256KiB in JFS. The minlen value of discard overrides the minlen value given diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX index 1716874a651e..66eb6c8c5334 100644 --- a/Documentation/filesystems/nfs/00-INDEX +++ b/Documentation/filesystems/nfs/00-INDEX @@ -20,3 +20,5 @@ rpc-cache.txt - introduction to the caching mechanisms in the sunrpc layer. idmapper.txt - information for configuring request-keys to be used by idmapper +rpc-server-gss.txt + - Information on GSS authentication support in the NFS Server diff --git a/Documentation/filesystems/nfs/rpc-server-gss.txt b/Documentation/filesystems/nfs/rpc-server-gss.txt new file mode 100644 index 000000000000..716f4be8e8b3 --- /dev/null +++ b/Documentation/filesystems/nfs/rpc-server-gss.txt @@ -0,0 +1,91 @@ + +rpcsec_gss support for kernel RPC servers +========================================= + +This document gives references to the standards and protocols used to +implement RPCGSS authentication in kernel RPC servers such as the NFS +server and the NFS client's NFSv4.0 callback server. (But note that +NFSv4.1 and higher don't require the client to act as a server for the +purposes of authentication.) + +RPCGSS is specified in a few IETF documents: + - RFC2203 v1: http://tools.ietf.org/rfc/rfc2203.txt + - RFC5403 v2: http://tools.ietf.org/rfc/rfc5403.txt +and there is a 3rd version being proposed: + - http://tools.ietf.org/id/draft-williams-rpcsecgssv3.txt + (At draft n.
02 at the time of writing) + +Background +---------- + +The RPCGSS Authentication method describes a way to perform GSSAPI +Authentication for NFS. Although GSSAPI is itself completely mechanism +agnostic, in many cases only the KRB5 mechanism is supported by NFS +implementations. + +The Linux kernel, at the moment, supports only the KRB5 mechanism, and +depends on GSSAPI extensions that are KRB5 specific. + +GSSAPI is a complex library, and implementing it completely in kernel is +unwarranted. However GSSAPI operations are fundamentally separable in 2 +parts: +- initial context establishment +- integrity/privacy protection (signing and encrypting of individual + packets) + +The former is more complex and policy-independent, but less +performance-sensitive. The latter is simpler and needs to be very fast. + +Therefore, we perform per-packet integrity and privacy protection in the +kernel, but leave the initial context establishment to userspace. We +need upcalls to request userspace to perform context establishment. + +NFS Server Legacy Upcall Mechanism +---------------------------------- + +The classic upcall mechanism uses a custom text based upcall mechanism +to talk to a custom daemon called rpc.svcgssd that is provided by the +nfs-utils package. + +This upcall mechanism has 2 limitations: + +A) It can handle tokens that are no bigger than 2KiB + +In some Kerberos deployments GSSAPI tokens can be quite big, up and +beyond 64KiB in size due to various authorization extensions attached to +the Kerberos tickets, that need to be sent through the GSS layer in +order to perform context establishment. + +B) It does not properly handle creds where the user is member of more +than a few thousand groups (the current hard limit in the kernel is 65K +groups) due to limitation on the size of the buffer that can be sent +back to the kernel (4KiB).
+ +NFS Server New RPC Upcall Mechanism +----------------------------------- + +The newer upcall mechanism uses RPC over a unix socket to a daemon +called gss-proxy, implemented by a userspace program called Gssproxy. + +The gss_proxy RPC protocol is currently documented here: + + https://fedorahosted.org/gss-proxy/wiki/ProtocolDocumentation + +This upcall mechanism uses the kernel rpc client and connects to the gssproxy +userspace program over a regular unix socket. The gssproxy protocol does not +suffer from the size limitations of the legacy protocol. + +Negotiating Upcall Mechanisms +----------------------------- + +To provide backward compatibility, the kernel defaults to using the +legacy mechanism. To switch to the new mechanism, gss-proxy must bind +to /var/run/gssproxy.sock and then write "1" to +/proc/net/rpc/use-gss-proxy. If gss-proxy dies, it must repeat both +steps. + +Once the upcall mechanism is chosen, it cannot be changed. To prevent +locking into the legacy mechanisms, the above steps must be performed +before starting nfsd. Whoever starts nfsd can guarantee this by reading +from /proc/net/rpc/use-gss-proxy and checking that it contains a +"1"--the read will block until gss-proxy has done its write to the file. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 0472c31c163b..206a1bdc7321 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -441,3 +441,13 @@ d_make_root() drops the reference to inode if dentry allocation fails. two, it gets "is it an O_EXCL or equivalent?" boolean argument. Note that local filesystems can ignore tha argument - they are guaranteed that the object doesn't exist. It's remote/distributed ones that might care... +-- +[mandatory] + FS_REVAL_DOT is gone; if you used to have it, add ->d_weak_revalidate() +in your dentry operations instead. 
+-- +[mandatory] + vfs_readdir() is gone; switch to iterate_dir() instead +-- +[mandatory] + ->readdir() is gone now; switch to ->iterate() diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fd8d0d594fc7..fcc22c982a25 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -473,7 +473,8 @@ This file is only present if the CONFIG_MMU kernel configuration option is enabled. The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG -bits on both physical and virtual pages associated with a process. +bits on both physical and virtual pages associated with a process, and the +soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details). To clear the bits for all the pages associated with the process > echo 1 > /proc/PID/clear_refs @@ -482,6 +483,10 @@ To clear the bits for the anonymous pages associated with the process To clear the bits for the file mapped pages associated with the process > echo 3 > /proc/PID/clear_refs + +To clear the soft-dirty bit + > echo 4 > /proc/PID/clear_refs + Any other value written to /proc/PID/clear_refs will have no effect. The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt index e59f2f09f56e..99e90184a72f 100644 --- a/Documentation/filesystems/qnx6.txt +++ b/Documentation/filesystems/qnx6.txt @@ -148,7 +148,7 @@ smaller than addressing space in the bitmap. Bitmap system area ------------------ -The bitmap itself is devided into three parts. +The bitmap itself is divided into three parts. First the system area, that is split into two halfs. Then userspace. 
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index d230dd9c99b0..aa1f459fa6cf 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -150,12 +150,28 @@ discard -- If set, issues discard/TRIM commands to the block device when blocks are freed. This is useful for SSD devices and sparse/thinly-provisoned LUNs. -nfs -- This option maintains an index (cache) of directory - inodes by i_logstart which is used by the nfs-related code to - improve look-ups. +nfs=stale_rw|nostale_ro + Enable this only if you want to export the FAT filesystem + over NFS. + + stale_rw: This option maintains an index (cache) of directory + inodes by i_logstart which is used by the nfs-related code to + improve look-ups. Full file operations (read/write) over NFS is + supported but with cache eviction at NFS server, this could + result in ESTALE issues. + + nostale_ro: This option bases the inode number and filehandle + on the on-disk location of a file in the MS-DOS directory entry. + This ensures that ESTALE will not be returned after a file is + evicted from the inode cache. However, it means that operations + such as rename, create and unlink could cause filehandles that + previously pointed at one file to point at a different file, + potentially causing data corruption. For this reason, this + option also mounts the filesystem readonly. 
+ + To maintain backward compatibility, '-o nfs' is also accepted, + defaulting to stale_rw - Enable this only if you want to export the FAT filesystem - over NFS <bool>: 0,1,yes,no,true,false @@ -291,7 +307,7 @@ the following: <proceeding files...> <slot #3, id = 0x43, characters = "h is long"> - <slot #2, id = 0x02, characters = "xtension whic"> + <slot #2, id = 0x02, characters = "xtension which"> <slot #1, id = 0x01, characters = "My Big File.E"> <directory entry, name = "MYBIGFIL.EXT"> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index e3869098163e..f93a88250a44 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -360,6 +360,8 @@ struct inode_operations { int (*removexattr) (struct dentry *, const char *); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, + int (*tmpfile) (struct inode *, struct dentry *, umode_t); +} ____cacheline_aligned; struct file *, unsigned open_flag, umode_t create_mode, int *opened); }; @@ -472,6 +474,9 @@ otherwise noted. component is negative or needs lookup. Cached positive dentries are still handled by f_op->open(). + tmpfile: called in the end of O_TMPFILE open(). Optional, equivalent to + atomically creating, opening and unlinking a file in given directory. + The Address Space Object ======================== @@ -549,12 +554,11 @@ struct address_space_operations ------------------------------- This describes how the VFS can manipulate mapping of a file to page cache in -your filesystem. As of kernel 2.6.22, the following members are defined: +your filesystem. 
The following members are defined: struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); - int (*sync_page)(struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); int (*readpages)(struct file *filp, struct address_space *mapping, @@ -566,7 +570,7 @@ struct address_space_operations { loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); - int (*invalidatepage) (struct page *, unsigned long); + void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, @@ -576,6 +580,9 @@ struct address_space_operations { /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, read_descriptor_t *, + unsigned long); + void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page) (struct mapping *mapping, struct page *page); int (*swap_activate)(struct file *); int (*swap_deactivate)(struct file *); @@ -607,13 +614,6 @@ struct address_space_operations { In this case, the page will be relocated, relocked and if that all succeeds, ->readpage will be called again. - sync_page: called by the VM to notify the backing store to perform all - queued I/O operations for a page. I/O operations for other pages - associated with this address_space object may also be performed. - - This function is optional and is called only for pages with - PG_Writeback set while waiting for the writeback to complete. - writepages: called by the VM to write out pages associated with the address_space object. 
If wbc->sync_mode is WBC_SYNC_ALL, then the writeback_control will specify a range of pages that must be @@ -685,14 +685,14 @@ struct address_space_operations { invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed from the address space. This generally corresponds to either a - truncation or a complete invalidation of the address space - (in the latter case 'offset' will always be 0). - Any private data associated with the page should be updated - to reflect this truncation. If offset is 0, then - the private data should be released, because the page - must be able to be completely discarded. This may be done by - calling the ->releasepage function, but in this case the - release MUST succeed. + truncation, punch hole or a complete invalidation of the address + space (in the latter case 'offset' will always be 0 and 'length' + will be PAGE_CACHE_SIZE). Any private data associated with the page + should be updated to reflect this truncation. If offset is 0 and + length is PAGE_CACHE_SIZE, then the private data should be released, + because the page must be able to be completely discarded. This may + be done by calling the ->releasepage function, but in this case the + release MUST succeed. releasepage: releasepage is called on PagePrivate pages to indicate that the page should be freed if possible. ->releasepage @@ -742,6 +742,20 @@ struct address_space_operations { prevent redirtying the page, it is kept locked during the whole operation. + is_partially_uptodate: Called by the VM when reading a file through the + pagecache when the underlying blocksize != pagesize. If the required + block is up to date then the read can complete without needing the IO + to bring the whole page up to date. + + is_dirty_writeback: Called by the VM when attempting to reclaim a page. + The VM uses dirty and writeback information to determine if it needs + to stall to allow flushers a chance to complete some IO. 
Ordinarily + it can use PageDirty and PageWriteback but some filesystems have + more complex state (unstable pages in NFS prevent reclaim) or + do not set those flags due to locking problems (jbd). This callback + allows a filesystem to indicate to the VM if a page should be + treated as dirty or writeback for the purposes of stalling. + error_remove_page: normally set to generic_error_remove_page if truncation is ok for this address space. Used for memory failure handling. Setting this implies you deal with pages going away under you, @@ -777,7 +791,7 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - int (*readdir) (struct file *, void *, filldir_t); + int (*iterate) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); @@ -815,7 +829,7 @@ otherwise noted. 
aio_write: called by io_submit(2) and other asynchronous I/O operations - readdir: called when the VFS needs to read the directory contents + iterate: called when the VFS needs to read the directory contents poll: called by the VFS when a process wants to check if there is activity on this file and (optionally) go to sleep until there @@ -900,10 +914,9 @@ defined: struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); - int (*d_hash)(const struct dentry *, const struct inode *, - struct qstr *); - int (*d_compare)(const struct dentry *, const struct inode *, - const struct dentry *, const struct inode *, + int (*d_weak_revalidate)(struct dentry *, unsigned int); + int (*d_hash)(const struct dentry *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); @@ -915,8 +928,13 @@ struct dentry_operations { d_revalidate: called when the VFS needs to revalidate a dentry. This is called whenever a name look-up finds a dentry in the - dcache. Most filesystems leave this as NULL, because all their - dentries in the dcache are valid + dcache. Most local filesystems leave this as NULL, because all their + dentries in the dcache are valid. Network filesystems are different + since things can change on the server without the client necessarily + being aware of it. + + This function should return a positive value if the dentry is still + valid, and zero or a negative error code if it isn't. d_revalidate may be called in rcu-walk mode (flags & LOOKUP_RCU). If in rcu-walk mode, the filesystem must revalidate the dentry without @@ -927,27 +945,40 @@ struct dentry_operations { If a situation is encountered that rcu-walk cannot handle, return -ECHILD and it will be called again in ref-walk mode. + d_weak_revalidate: called when the VFS needs to revalidate a "jumped" dentry. 
+ This is called when a path-walk ends at a dentry that was not acquired by + doing a lookup in the parent directory. This includes "/", "." and "..", + as well as procfs-style symlinks and mountpoint traversal. + + In this case, we are less concerned with whether the dentry is still + fully correct, but rather that the inode is still valid. As with + d_revalidate, most local filesystems will set this to NULL since their + dcache entries are always valid. + + This function has the same return code semantics as d_revalidate. + + d_weak_revalidate is only called after leaving rcu-walk mode. + d_hash: called when the VFS adds a dentry to the hash table. The first dentry passed to d_hash is the parent directory that the name is - to be hashed into. The inode is the dentry's inode. + to be hashed into. Same locking and synchronisation rules as d_compare regarding what is safe to dereference etc. d_compare: called to compare a dentry name with a given name. The first dentry is the parent of the dentry to be compared, the second is - the parent's inode, then the dentry and inode (may be NULL) of the - child dentry. len and name string are properties of the dentry to be - compared. qstr is the name to compare it with. + the child dentry. len and name string are properties of the dentry + to be compared. qstr is the name to compare it with. Must be constant and idempotent, and should not take locks if - possible, and should not or store into the dentry or inodes. - Should not dereference pointers outside the dentry or inodes without + possible, and should not store into the dentry. + Should not dereference pointers outside the dentry without lots of care (eg. d_parent, d_inode, d_name should not be used). However, our vfsmount is pinned, and RCU held, so the dentries and inodes won't disappear, neither will our sb or filesystem module. - ->i_sb and ->d_sb may be used. + ->d_sb may be used. 
It is a tricky calling convention because it needs to be called under "rcu-walk", ie. without any locks or references on things. diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt new file mode 100644 index 000000000000..05aa455163e3 --- /dev/null +++ b/Documentation/filesystems/xfs-self-describing-metadata.txt @@ -0,0 +1,350 @@ +XFS Self Describing Metadata +---------------------------- + +Introduction +------------ + +The largest scalability problem facing XFS is not one of algorithmic +scalability, but of verification of the filesystem structure. Scalability of the +structures and indexes on disk and the algorithms for iterating them are +adequate for supporting PB scale filesystems with billions of inodes, however it +is this very scalability that causes the verification problem. + +Almost all metadata on XFS is dynamically allocated. The only fixed location +metadata is the allocation group headers (SB, AGF, AGFL and AGI), while all +other metadata structures need to be discovered by walking the filesystem +structure in different ways. While this is already done by userspace tools for +validating and repairing the structure, there are limits to what they can +verify, and this in turn limits the supportable size of an XFS filesystem. + +For example, it is entirely possible to manually use xfs_db and a bit of +scripting to analyse the structure of a 100TB filesystem when trying to +determine the root cause of a corruption problem, but it is still mainly a +manual task of verifying that things like single bit errors or misplaced writes +weren't the ultimate cause of a corruption event. It may take a few hours to a +few days to perform such forensic analysis, so at this scale root cause +analysis is entirely possible. + +However, if we scale the filesystem up to 1PB, we now have 10x as much metadata +to analyse and so that analysis blows out towards weeks/months of forensic work. 
+Most of the analysis work is slow and tedious, so as the amount of analysis goes +up, the more likely that the cause will be lost in the noise. Hence the primary +concern for supporting PB scale filesystems is minimising the time and effort +required for basic forensic analysis of the filesystem structure. + + +Self Describing Metadata +------------------------ + +One of the problems with the current metadata format is that apart from the +magic number in the metadata block, we have no other way of identifying what it +is supposed to be. We can't even identify if it is the right place. Put simply, +you can't look at a single metadata block in isolation and say "yes, it is +supposed to be there and the contents are valid". + +Hence most of the time spent on forensic analysis is spent doing basic +verification of metadata values, looking for values that are in range (and hence +not detected by automated verification checks) but are not correct. Finding and +understanding how things like cross linked block lists (e.g. sibling +pointers in a btree that end up with loops in them) came about is the key to understanding what +went wrong, but it is impossible to tell what order the blocks were linked into +each other or written to disk after the fact. + +Hence we need to record more information into the metadata to allow us to +quickly determine if the metadata is intact and can be ignored for the purpose +of analysis. We can't protect against every possible type of error, but we can +ensure that common types of errors are easily detectable. Hence the concept of +self describing metadata. + +The first, fundamental requirement of self describing metadata is that the +metadata object contains some form of unique identifier in a well known +location. This allows us to identify the expected contents of the block and +hence parse and verify the metadata object. If we can't independently identify +the type of metadata in the object, then the metadata doesn't describe itself +very well at all! 
+ +Luckily, almost all XFS metadata has magic numbers embedded already - only the +AGFL, remote symlinks and remote attribute blocks do not contain identifying +magic numbers. Hence we can change the on-disk format of all these objects to +add more identifying information and detect this simply by changing the magic +numbers in the metadata objects. That is, if it has the current magic number, +the metadata isn't self identifying. If it contains a new magic number, it is +self identifying and we can do much more expansive automated verification of the +metadata object at runtime, during forensic analysis or repair. + +As a primary concern, self describing metadata needs some form of overall +integrity checking. We cannot trust the metadata if we cannot verify that it has +not been changed as a result of external influences. Hence we need some form of +integrity check, and this is done by adding CRC32c validation to the metadata +block. If we can verify the block contains the metadata it was intended to +contain, a large amount of the manual verification work can be skipped. + +CRC32c was selected as metadata cannot be more than 64k in length in XFS and +hence a 32 bit CRC is more than sufficient to detect multi-bit errors in +metadata blocks. CRC32c is also now hardware accelerated on common CPUs so it is +fast. So while CRC32c is not the strongest of possible integrity checks that +could be used, it is more than sufficient for our needs and has relatively +little overhead. Adding support for larger integrity fields and/or algorithms +does not really provide any extra value over CRC32c, but it does add a lot of +complexity and so there is no provision for changing the integrity checking +mechanism. + +Self describing metadata needs to contain enough information so that the +metadata block can be verified as being in the correct place without needing to +look at any other metadata. This means it needs to contain location information. 
+Just adding a block number to the metadata is not sufficient to protect against +mis-directed writes - a write might be misdirected to the wrong LUN and so be +written to the "correct block" of the wrong filesystem. Hence location +information must contain a filesystem identifier as well as a block number. + +Another key information point in forensic analysis is knowing who the metadata +block belongs to. We already know the type, the location, that it is valid +and/or corrupted, and how long ago that it was last modified. Knowing the owner +of the block is important as it allows us to find other related metadata to +determine the scope of the corruption. For example, if we have an extent btree +object, we don't know what inode it belongs to and hence have to walk the entire +filesystem to find the owner of the block. Worse, the corruption could mean that +no owner can be found (i.e. it's an orphan block), and so without an owner field +in the metadata we have no idea of the scope of the corruption. If we have an +owner field in the metadata object, we can immediately do top down validation to +determine the scope of the problem. + +Different types of metadata have different owner identifiers. For example, +directory, attribute and extent tree blocks are all owned by an inode, whilst +freespace btree blocks are owned by an allocation group. Hence the size and +contents of the owner field are determined by the type of metadata object we are +looking at. The owner information can also identify misplaced writes (e.g. +freespace btree block written to the wrong AG). + +Self describing metadata also needs to contain some indication of when it was +written to the filesystem. One of the key information points when doing forensic +analysis is how recently the block was modified. 
Correlation of a set of corrupted +metadata blocks based on modification times is important as it can indicate +whether the corruptions are related, whether there's been multiple corruption +events that led to the eventual failure, and even whether there are corruptions +present that the run-time verification is not detecting. + +For example, we can determine whether a metadata object is supposed to be free +space or still allocated if it is still referenced by its owner by looking at +when the free space btree block that contains the block was last written +compared to when the metadata object itself was last written. If the free space +block is more recent than the object and the object's owner, then there is a +very good chance that the block should have been removed from the owner. + +To provide this "written timestamp", each metadata block gets the Log Sequence +Number (LSN) of the most recent transaction it was modified on written into it. +This number will always increase over the life of the filesystem, and the only +thing that resets it is running xfs_repair on the filesystem. Further, by use of +the LSN we can tell if the corrupted metadata all belonged to the same log +checkpoint and hence have some idea of how much modification occurred between +the first and last instance of corrupt metadata on disk and, further, how much +modification occurred between the corruption being written and when it was +detected. + +Runtime Validation +------------------ + +Validation of self-describing metadata takes place at runtime in two places: + + - immediately after a successful read from disk + - immediately prior to write IO submission + +The verification is completely stateless - it is done independently of the +modification process, and seeks only to check that the metadata is what it says +it is and that the metadata fields are within bounds and internally consistent. 
+As such, we cannot catch all types of corruption that can occur within a block +as there may be certain limitations that operational state enforces on the +metadata, or there may be corruption of interblock relationships (e.g. corrupted +sibling pointer lists). Hence we still need stateful checking in the main code +body, but in general most of the per-field validation is handled by the +verifiers. + +For read verification, the caller needs to specify the expected type of metadata +that it should see, and the IO completion process verifies that the metadata +object matches what was expected. If the verification process fails, then it +marks the object being read as EFSCORRUPTED. The caller needs to catch this +error (same as for IO errors), and if it needs to take special action due to a +verification error it can do so by catching the EFSCORRUPTED error value. If we +need more discrimination of error type at higher levels, we can define new +error numbers for different errors as necessary. + +The first step in read verification is checking the magic number and determining +whether CRC validating is necessary. If it is, the CRC32c is calculated and +compared against the value stored in the object itself. Once this is validated, +further checks are made against the location information, followed by extensive +object specific metadata validation. If any of these checks fail, then the +buffer is considered corrupt and the EFSCORRUPTED error is set appropriately. + +Write verification is the opposite of the read verification - first the object +is extensively verified and if it is OK we then update the LSN from the last +modification made to the object. After this, we calculate the CRC and insert it +into the object. Once this is done the write IO is allowed to continue. If any +error occurs during this process, the buffer is again marked with an EFSCORRUPTED +error for the higher layers to catch. 
+ +Structures +---------- + +A typical on-disk structure needs to contain the following information: + +struct xfs_ondisk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC, not logged */ + uuid_t uuid; /* filesystem identifier */ + __be64 owner; /* parent object */ + __be64 blkno; /* location on disk */ + __be64 lsn; /* last modification in log, not logged */ +}; + +Depending on the metadata, this information may be part of a header structure +separate to the metadata contents, or may be distributed through an existing +structure. The latter occurs with metadata that already contains some of this +information, such as the superblock and AG headers. + +Other metadata may have different formats for the information, but the same +level of information is generally provided. For example: + + - short btree blocks have a 32 bit owner (ag number) and a 32 bit block + number for location. The two of these combined provide the same + information as @owner and @blkno in the above structure, but using 8 + bytes less space on disk. + + - directory/attribute node blocks have a 16 bit magic number, and the + header that contains the magic number has other information in it as + well. Hence the additional metadata headers change the overall format + of the metadata. 
+ +A typical buffer read verifier is structured as follows: + +#define XFS_FOO_CRC_OFF offsetof(struct xfs_ondisk_hdr, crc) + +static void +xfs_foo_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if ((xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + XFS_FOO_CRC_OFF)) || + !xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + } +} + +The code ensures that the CRC is only checked if the filesystem has CRCs enabled +by checking the superblock of the feature bit, and then if the CRC verifies OK +(or is not needed) it verifies the actual contents of the block. + +The verifier function will take a couple of different forms, depending on +whether the magic number can be used to determine the format of the block. In +the case it can't, the code is structured as follows: + +static bool +xfs_foo_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; + + if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } + + /* object specific verification checks here */ + + return true; +} + +If there are different magic numbers for the different formats, the verifier +will look like: + +static bool +xfs_foo_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_ondisk_hdr *hdr = bp->b_addr; + + if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) { + if (!uuid_equal(&hdr->uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(hdr->blkno)) + return false; + if (hdr->owner == 0) + return false; + } else if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) + return false; + + /* object specific 
verification checks here */ + + return true; +} + +Write verifiers are very similar to the read verifiers, they just do things in +the opposite order to the read verifiers. A typical write verifier: + +static void +xfs_foo_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_foo_verify(bp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + + if (bip) { + struct xfs_ondisk_hdr *hdr = bp->b_addr; + hdr->lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_FOO_CRC_OFF); +} + +This will verify the internal structure of the metadata before we go any +further, detecting corruptions that have occurred as the metadata has been +modified in memory. If the metadata verifies OK, and CRCs are enabled, we then +update the LSN field (when it was last modified) and calculate the CRC on the +metadata. Once this is done, we can issue the IO. + +Inodes and Dquots +----------------- + +Inodes and dquots are special snowflakes. They have per-object CRC and +self-identifiers, but they are packed so that there are multiple objects per +buffer. Hence we do not use per-buffer verifiers to do the work of per-object +verification and CRC calculations. The per-buffer verifiers simply perform basic +identification of the buffer - that they contain inodes or dquots, and that +there are magic numbers in all the expected spots. All further CRC and +verification checks are done when each inode is read from or written back to the +buffer. + +The structure of the verifiers and the identifiers checks is very similar to the +buffer code described above. The only difference is where they are called. 
For +example, inode read verification is done in xfs_iread() when the inode is first +read out of the buffer and the struct xfs_inode is instantiated. The inode is +already extensively verified during writeback in xfs_iflush_int, so the only +addition here is to add the LSN and CRC to the inode as it is copied back into +the buffer. + +XXX: inode unlinked list modification doesn't recalculate the inode CRC! None of +the unlinked list modifications check or update CRCs, neither during unlink nor +log recovery. So, it's gone unnoticed until now. This won't matter immediately - +repair will probably complain about it - but it needs to be fixed. + diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 3e4b3dd1e046..83577f0232a0 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -33,6 +33,9 @@ When mounting an XFS filesystem, the following options are accepted. removing extended attributes) the on-disk superblock feature bit field will be updated to reflect this format being in use. + CRC enabled filesystems always use the attr2 format, and so + will reject the noattr2 mount option if it is set. + barrier Enables the use of block layer write barriers for writes into the journal and unwritten extent conversion. This allows for |