From 5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Wed, 10 Jun 2009 10:45:14 -0400 Subject: Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE) This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2624b53ea783..54dfd45cc591 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -82,22 +82,25 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - objectid, trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; } - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); write_extent_buffer(leaf, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item.inode; @@ -125,7 +128,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_dirid(&root_item, new_dirid); key.objectid = objectid; - key.offset = 1; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); @@ -911,10 +914,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, - disko, diskl, leaf->start, - root->root_key.objectid, - trans->transid, - inode->i_ino); + disko, diskl, 0, + root->root_key.objectid, + inode->i_ino, + new_key.offset - datao); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { -- cgit From 6cbff00f4632c8060b06bfc9585805217f11e12e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2009 10:37:41 +0200 Subject: Btrfs: implement FS_IOC_GETFLAGS/SETFLAGS/GETVERSION Add support for the standard attributes set via chattr and read via lsattr. Currently we store the attributes in the flags value in the btrfs inode, but I wonder whether we should split it into two so that we don't have to keep converting between the two formats. Remove the btrfs_clear_flag/btrfs_set_flag/btrfs_test_flag macros as they were confusing the existing code and got in the way of the new additions. Also add the FS_IOC_GETVERSION ioctl for getting i_generation as it's trivial. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/compression.c | 6 +- fs/btrfs/ctree.h | 16 +++-- fs/btrfs/inode.c | 27 ++++---- fs/btrfs/ioctl.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 200 insertions(+), 21 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ecf5f7d8166f..acb4f3517582 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -157,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } - #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ab07627084f1..de1e2fd32080 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (btrfs_test_flag(inode, NODATASUM)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ atomic_inc(&cb->pending_bios); - if (!btrfs_test_flag(inode, NODATASUM)) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!btrfs_test_flag(inode, NODATASUM)) + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) btrfs_lookup_bio_sums(root, inode, comp_bio, sums); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5fa7d7d287a4..4d6e0b6f21ea 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1115,12 +1115,14 @@ struct btrfs_root { #define BTRFS_INODE_READONLY (1 << 2) #define BTRFS_INODE_NOCOMPRESS (1 << 3) #define BTRFS_INODE_PREALLOC (1 << 4) -#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ - ~BTRFS_INODE_##flag) -#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ - BTRFS_INODE_##flag) -#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ - BTRFS_INODE_##flag) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) + + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -2260,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); /* file.c */ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 917bf10597c6..5b68330f8585 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -368,7 +368,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (!btrfs_test_flag(inode, NOCOMPRESS) && + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && btrfs_test_opt(root, COMPRESS)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -469,7 +469,7 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - btrfs_set_flag(inode, NOCOMPRESS); + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } if (will_compress) { *num_added += 1; @@ -862,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (btrfs_test_flag(inode, NOCOMPRESS)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -1133,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (btrfs_test_flag(inode, NODATACOW)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (btrfs_test_flag(inode, PREALLOC)) + else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS)) @@ -1290,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; - skip_sum = btrfs_test_flag(inode, NODATASUM); + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); @@ -1790,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, ClearPageChecked(page); goto good; } - if (btrfs_test_flag(inode, NODATASUM)) + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && @@ -2156,6 +2157,8 @@ static void btrfs_read_locked_inode(struct inode *inode) init_special_inode(inode, inode->i_mode, rdev); break; } + + btrfs_update_iflags(inode); return; make_bad: @@ -3586,9 +3589,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_find_block_group(root, 0, alloc_hint, owner); if ((mode & S_IFREG)) { if (btrfs_test_opt(root, NODATASUM)) - btrfs_set_flag(inode, NODATASUM); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - btrfs_set_flag(inode, NODATACOW); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } key[0].objectid = objectid; @@ -3642,6 +3645,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location->offset = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + btrfs_inherit_iflags(inode, dir); + insert_inode_hash(inode); inode_tree_add(inode); return inode; @@ -5075,7 +5080,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, out: if (cur_offset > start) { inode->i_ctime = CURRENT_TIME; - btrfs_set_flag(inode, PREALLOC); + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && cur_offset > i_size_read(inode)) btrfs_i_size_write(inode, cur_offset); @@ -5196,7 +5201,7 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { - if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, btrfs_check_acl); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 54dfd45cc591..926332a73cde 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,7 +50,172 @@ #include "volumes.h" #include "locking.h" +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + + if (ip->flags & BTRFS_INODE_SYNC) + inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; +} + +/* + * Inherit flags from the parent inode. + * + * Unlike extN we don't have any flags we don't want to inherit currently. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags = BTRFS_I(dir)->flags; + + if (S_ISREG(inode->i_mode)) + flags &= ~BTRFS_INODE_DIRSYNC; + else if (!S_ISDIR(inode->i_mode)) + flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + + BTRFS_I(inode)->flags = flags; + btrfs_update_iflags(inode); +} + +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL)) + return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EACCES; + + mutex_lock(&inode->i_mutex); + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out_unlock; + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_end_transaction(trans, root); + + mnt_drop_write(file->f_path.mnt); + out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + return put_user(inode->i_generation, arg); +} static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, @@ -1077,6 +1242,12 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: -- cgit From 0b4dcea579a1b6f4d249d61f5bc8adeaa7c895d8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 11 Jun 2009 11:13:35 -0400 Subject: Btrfs: fix oops when btrfs_inherit_iflags called with a NULL dir This happens during subvol creation. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 926332a73cde..eff18f5b5362 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -112,7 +112,12 @@ void btrfs_update_iflags(struct inode *inode) */ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { - unsigned int flags = BTRFS_I(dir)->flags; + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; if (S_ISREG(inode->i_mode)) flags &= ~BTRFS_INODE_DIRSYNC; -- cgit