Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r-- | fs/btrfs/ioctl.c | 358
1 file changed, 321 insertions, 37 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a5bd6926f7ff..8d47ec5fc4f4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -805,10 +805,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 		goto fail;
 	}
 
-	spin_lock(&fs_info->trans_lock);
-	list_add(&pending_snapshot->list,
-		 &trans->transaction->pending_snapshots);
-	spin_unlock(&fs_info->trans_lock);
+	trans->pending_snapshot = pending_snapshot;
 
 	ret = btrfs_commit_transaction(trans);
 	if (ret)
@@ -1015,8 +1012,155 @@ out:
 	return ret;
 }
 
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ *   To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ *   By using btrfs_search_forward() we can skip entire file ranges that
+ *   have extents created in past transactions, because btrfs_search_forward()
+ *   will not visit leaves and nodes with a generation smaller than given
+ *   minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we cannot find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+					    u64 start, u64 newer_than)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_path path = { 0 };
+	struct extent_map *em;
+	struct btrfs_key key;
+	u64 ino = btrfs_ino(inode);
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	if (newer_than) {
+		ret = btrfs_search_forward(root, &key, &path, newer_than);
+		if (ret < 0)
+			goto err;
+		/* Can't find anything newer */
+		if (ret > 0)
+			goto not_found;
+	} else {
+		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+		if (ret < 0)
+			goto err;
+	}
+	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+		/*
+		 * If btrfs_search_slot() makes the path point beyond nritems,
+		 * we should not have an empty leaf, as this inode must at
+		 * least have its INODE_ITEM.
+		 */
+		ASSERT(btrfs_header_nritems(path.nodes[0]));
+		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+	}
+	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+	/* Perfect match, no need to go one slot back */
+	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+	    key.offset == start)
+		goto iterate;
+
+	/* We didn't find a perfect match, need to go one slot back */
+	if (path.slots[0] > 0) {
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+			path.slots[0]--;
+	}
+
+iterate:
+	/* Iterate through the path to find a file extent covering @start */
+	while (true) {
+		u64 extent_end;
+
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+			goto next;
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+		/*
+		 * We may go one slot back to INODE_REF/XATTR item, then
+		 * need to go forward until we reach an EXTENT_DATA.
+		 * But we should still have the correct ino as key.objectid.
+		 */
+		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+			goto next;
+
+		/* It's beyond our target range, definitely no extent found */
+		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+			goto not_found;
+
+		/*
+		 *	|	|<- File extent ->|
+		 *	\- start
+		 *
+		 * This means there is a hole between start and key.offset.
+		 */
+		if (key.offset > start) {
+			em->start = start;
+			em->orig_start = start;
+			em->block_start = EXTENT_MAP_HOLE;
+			em->len = key.offset - start;
+			break;
+		}
+
+		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+				    struct btrfs_file_extent_item);
+		extent_end = btrfs_file_extent_end(&path);
+
+		/*
+		 *	|<- file extent ->|	|
+		 *				\- start
+		 *
+		 * We haven't reached start, search next slot.
+		 */
+		if (extent_end <= start)
+			goto next;
+
+		/* Now this extent covers @start, convert it to em */
+		btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+		break;
+next:
+		ret = btrfs_next_item(root, &path);
+		if (ret < 0)
+			goto err;
+		if (ret > 0)
+			goto not_found;
+	}
+	btrfs_release_path(&path);
+	return em;
+
+not_found:
+	btrfs_release_path(&path);
+	free_extent_map(em);
+	return NULL;
+
+err:
+	btrfs_release_path(&path);
+	free_extent_map(em);
+	return ERR_PTR(ret);
+}
+
 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
-					       bool locked)
+					       u64 newer_than, bool locked)
 {
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -1031,6 +1175,20 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 	em = lookup_extent_mapping(em_tree, start, sectorsize);
 	read_unlock(&em_tree->lock);
 
+	/*
+	 * We can get a merged extent, in that case, we need to re-search
+	 * tree to get the original em for defrag.
+	 *
+	 * If @newer_than is 0 or em::generation < newer_than, we can trust
+	 * this em, as either we don't care about the generation, or the
+	 * merged extent map will be rejected anyway.
+	 */
+	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+	    newer_than && em->generation >= newer_than) {
+		free_extent_map(em);
+		em = NULL;
+	}
+
 	if (!em) {
 		struct extent_state *cached = NULL;
 		u64 end = start + sectorsize - 1;
@@ -1038,7 +1196,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 		/* get the big lock and read metadata off disk */
 		if (!locked)
 			lock_extent_bits(io_tree, start, end, &cached);
-		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
 
 		if (!locked)
 			unlock_extent_cached(io_tree, start, end, &cached);
@@ -1049,23 +1207,42 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 	return em;
 }
 
+static u32 get_extent_max_capacity(const struct extent_map *em)
+{
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+		return BTRFS_MAX_COMPRESSED;
+	return BTRFS_MAX_EXTENT_SIZE;
+}
+
 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
 				     bool locked)
 {
 	struct extent_map *next;
-	bool ret = true;
+	bool ret = false;
 
 	/* this is the last extent */
 	if (em->start + em->len >= i_size_read(inode))
 		return false;
 
-	next = defrag_lookup_extent(inode, em->start + em->len, locked);
+	/*
+	 * We want to check if the next extent can be merged with the current
+	 * one, which can be an extent created in a past generation, so we pass
+	 * a minimum generation of 0 to defrag_lookup_extent().
+	 */
+	next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
+	/* No more em or hole */
 	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
-		ret = false;
-	else if ((em->block_start + em->block_len == next->block_start) &&
-		 (em->block_len > SZ_128K && next->block_len > SZ_128K))
-		ret = false;
-
+		goto out;
+	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+		goto out;
+	/*
+	 * If the next extent is at its max capacity, defragging current extent
+	 * makes no sense, as the total number of extents won't change.
+	 */
+	if (next->len >= get_extent_max_capacity(em))
+		goto out;
+	ret = true;
+out:
 	free_extent_map(next);
 	return ret;
 }
@@ -1189,8 +1366,10 @@ struct defrag_target_range {
 static int defrag_collect_targets(struct btrfs_inode *inode,
 				  u64 start, u64 len, u32 extent_thresh,
 				  u64 newer_than, bool do_compress,
-				  bool locked, struct list_head *target_list)
+				  bool locked, struct list_head *target_list,
+				  u64 *last_scanned_ret)
 {
+	bool last_is_target = false;
 	u64 cur = start;
 	int ret = 0;
 
@@ -1200,7 +1379,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		bool next_mergeable = true;
 		u64 range_len;
 
-		em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+		last_is_target = false;
+		em = defrag_lookup_extent(&inode->vfs_inode, cur,
+					  newer_than, locked);
 		if (!em)
 			break;
 
@@ -1213,6 +1394,39 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		if (em->generation < newer_than)
 			goto next;
 
+		/* This em is under writeback, no need to defrag */
+		if (em->generation == (u64)-1)
+			goto next;
+
+		/*
+		 * Our start offset might be in the middle of an existing extent
+		 * map, so take that into account.
+		 */
+		range_len = em->len - (cur - em->start);
+		/*
+		 * If this range of the extent map is already flagged for delalloc,
+		 * skip it, because:
+		 *
+		 * 1) We could deadlock later, when trying to reserve space for
+		 *    delalloc, because in case we can't immediately reserve space
+		 *    the flusher can start delalloc and wait for the respective
+		 *    ordered extents to complete. The deadlock would happen
+		 *    because we do the space reservation while holding the range
+		 *    locked, and starting writeback, or finishing an ordered
+		 *    extent, requires locking the range;
+		 *
+		 * 2) If there's delalloc there, it means there's dirty pages for
+		 *    which writeback has not started yet (we clean the delalloc
+		 *    flag when starting writeback and after creating an ordered
+		 *    extent). If we mark pages in an adjacent range for defrag,
+		 *    then we will have a larger contiguous range for delalloc,
+		 *    very likely resulting in a larger extent after writeback is
+		 *    triggered (except in a case of free space fragmentation).
+		 */
+		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
+				   EXTENT_DELALLOC, 0, NULL))
+			goto next;
+
 		/*
 		 * For do_compress case, we want to compress all valid file
 		 * extents, thus no @extent_thresh or mergeable check.
@@ -1221,7 +1435,14 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 			goto add;
 
 		/* Skip too large extent */
-		if (em->len >= extent_thresh)
+		if (range_len >= extent_thresh)
+			goto next;
+
+		/*
+		 * Skip extents already at their max capacity, this is mostly
+		 * for compressed extents, whose max capacity is only 128K.
+		 */
+		if (em->len >= get_extent_max_capacity(em))
 			goto next;
 
 		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
@@ -1242,6 +1463,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		}
 
 add:
+		last_is_target = true;
 		range_len = min(extent_map_end(em), start + len) - cur;
 		/*
 		 * This one is a good target, check if it can be merged into
@@ -1285,6 +1507,17 @@ next:
 			kfree(entry);
 		}
 	}
+	if (!ret && last_scanned_ret) {
+		/*
+		 * If the last extent is not a target, the caller can skip to
+		 * the end of that extent.
+		 * Otherwise, we can only go to the end of the specified range.
+		 */
+		if (!last_is_target)
+			*last_scanned_ret = max(cur, *last_scanned_ret);
+		else
+			*last_scanned_ret = max(start + len, *last_scanned_ret);
+	}
 	return ret;
 }
 
@@ -1343,7 +1576,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
 }
 
 static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
-			    u32 extent_thresh, u64 newer_than, bool do_compress)
+			    u32 extent_thresh, u64 newer_than, bool do_compress,
+			    u64 *last_scanned_ret)
 {
 	struct extent_state *cached_state = NULL;
 	struct defrag_target_range *entry;
@@ -1389,7 +1623,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
 	 */
 	ret = defrag_collect_targets(inode, start, len, extent_thresh,
 				     newer_than, do_compress, true,
-				     &target_list);
+				     &target_list, last_scanned_ret);
 	if (ret < 0)
 		goto unlock_extent;
 
@@ -1424,7 +1658,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 			      u64 start, u32 len, u32 extent_thresh,
 			      u64 newer_than, bool do_compress,
 			      unsigned long *sectors_defragged,
-			      unsigned long max_sectors)
+			      unsigned long max_sectors,
+			      u64 *last_scanned_ret)
 {
 	const u32 sectorsize = inode->root->fs_info->sectorsize;
 	struct defrag_target_range *entry;
@@ -1435,21 +1670,32 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 	BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
 	ret = defrag_collect_targets(inode, start, len, extent_thresh,
 				     newer_than, do_compress, false,
-				     &target_list);
+				     &target_list, NULL);
 	if (ret < 0)
 		goto out;
 
 	list_for_each_entry(entry, &target_list, list) {
 		u32 range_len = entry->len;
 
-		/* Reached the limit */
-		if (max_sectors && max_sectors == *sectors_defragged)
+		/* Reached or beyond the limit */
+		if (max_sectors && *sectors_defragged >= max_sectors) {
+			ret = 1;
 			break;
+		}
 
 		if (max_sectors)
 			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);
 
+		/*
+		 * If defrag_one_range() has updated last_scanned_ret,
+		 * our range may already be invalid (e.g. hole punched).
+		 * Skip if our range is before last_scanned_ret, as there is
+		 * no need to defrag the range anymore.
+		 */
+		if (entry->start + range_len <= *last_scanned_ret)
+			continue;
+
 		if (ra)
 			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
 				ra, NULL, entry->start >> PAGE_SHIFT,
@@ -1462,16 +1708,20 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
 		 * accounting.
 		 */
 		ret = defrag_one_range(inode, entry->start, range_len,
-				       extent_thresh, newer_than, do_compress);
+				       extent_thresh, newer_than, do_compress,
+				       last_scanned_ret);
 		if (ret < 0)
 			break;
-		*sectors_defragged += range_len;
+		*sectors_defragged += range_len >>
+				      inode->root->fs_info->sectorsize_bits;
 	}
 out:
 	list_for_each_entry_safe(entry, tmp, &target_list, list) {
 		list_del_init(&entry->list);
 		kfree(entry);
 	}
+	if (ret >= 0)
+		*last_scanned_ret = max(*last_scanned_ret, start + len);
 	return ret;
 }
 
@@ -1484,6 +1734,12 @@ out:
  * @newer_than:	   minimum transid to defrag
  * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
  *		   will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be updated
+ * to indicate the file offset where the next defrag should start.
+ * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early without
+ * defragging the whole range).
  */
 int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 		      struct btrfs_ioctl_defrag_range_args *range,
@@ -1499,6 +1755,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	int ret = 0;
 	u32 extent_thresh = range->extent_thresh;
+	pgoff_t start_index;
 
 	if (isize == 0)
 		return 0;
@@ -1518,12 +1775,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 
 	if (range->start + range->len > range->start) {
 		/* Got a specific range */
-		last_byte = min(isize, range->start + range->len) - 1;
+		last_byte = min(isize, range->start + range->len);
 	} else {
 		/* Defrag until file end */
-		last_byte = isize - 1;
+		last_byte = isize;
 	}
 
+	/* Align the range */
+	cur = round_down(range->start, fs_info->sectorsize);
+	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+
 	/*
 	 * If we were not given a ra, allocate a readahead context. As
 	 * readahead is just an optimization, defrag will work without it so
@@ -1536,16 +1797,27 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 		file_ra_state_init(ra, inode->i_mapping);
 	}
 
-	/* Align the range */
-	cur = round_down(range->start, fs_info->sectorsize);
-	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
+	/*
+	 * Make writeback start from the beginning of the range, so that the
+	 * defrag range can be written sequentially.
+	 */
+	start_index = cur >> PAGE_SHIFT;
+	if (start_index < inode->i_mapping->writeback_index)
+		inode->i_mapping->writeback_index = start_index;
 
 	while (cur < last_byte) {
+		const unsigned long prev_sectors_defragged = sectors_defragged;
+		u64 last_scanned = cur;
 		u64 cluster_end;
 
 		/* The cluster size 256K should always be page aligned */
 		BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
 
+		if (btrfs_defrag_cancelled(fs_info)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		/* We want the cluster end at page boundary when possible */
 		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
@@ -1565,16 +1837,30 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
 			BTRFS_I(inode)->defrag_compress = compress_type;
 		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
 				cluster_end + 1 - cur, extent_thresh,
-				newer_than, do_compress,
-				&sectors_defragged, max_to_defrag);
+				newer_than, do_compress, &sectors_defragged,
+				max_to_defrag, &last_scanned);
+
+		if (sectors_defragged > prev_sectors_defragged)
+			balance_dirty_pages_ratelimited(inode->i_mapping);
+
 		btrfs_inode_unlock(inode, 0);
 		if (ret < 0)
 			break;
-		cur = cluster_end + 1;
+		cur = max(cluster_end + 1, last_scanned);
+		if (ret > 0) {
+			ret = 0;
+			break;
+		}
+		cond_resched();
 	}
 
 	if (ra_allocated)
 		kfree(ra);
+	/*
+	 * Update range.start for autodefrag, this will indicate where to start
+	 * in the next run.
+	 */
+	range->start = cur;
 	if (sectors_defragged) {
 		/*
 		 * We have defragged some sectors, for compression case they
@@ -3086,10 +3372,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	btrfs_inode_lock(inode, 0);
 	err = btrfs_delete_subvolume(dir, dentry);
 	btrfs_inode_unlock(inode, 0);
-	if (!err) {
-		fsnotify_rmdir(dir, dentry);
-		d_delete(dentry);
-	}
+	if (!err)
+		d_delete_notify(dir, dentry);
 
 out_dput:
 	dput(dentry);
@@ -3290,7 +3574,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	struct block_device *bdev = NULL;
 	fmode_t mode;
 	int ret;
-	bool cancel;
+	bool cancel = false;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
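For context only, and not part of the patch above: the btrfs_defrag_file() changes in this diff sit behind the existing BTRFS_IOC_DEFRAG_RANGE ioctl. The following is a minimal userspace sketch that drives that path; the file path, length and the 256K extent_thresh are arbitrary example values, and error handling is kept to a minimum.

/* Hypothetical example, not from the patch: trigger a ranged defrag on a
 * btrfs file via the BTRFS_IOC_DEFRAG_RANGE ioctl from <linux/btrfs.h>. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int main(int argc, char **argv)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd, ret;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file on btrfs>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;		/* kernel rounds this down to sectorsize */
	range.len = (__u64)-1;		/* whole file; kernel clamps to i_size */
	range.extent_thresh = 256 * 1024; /* skip extents >= 256K */

	ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
	if (ret < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");

	close(fd);
	return ret < 0;
}

With this patch, btrfs_defrag_file() also records the offset where the next pass should resume in range->start and stops once @max_to_defrag sectors have been processed, which is what the kernel's autodefrag callers rely on per the comments added in the diff.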