Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r--  fs/btrfs/ioctl.c  1013
1 file changed, 489 insertions, 524 deletions
| diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc61813213d8..fb8cc9642ac4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -48,6 +48,7 @@  #include "space-info.h"  #include "delalloc-space.h"  #include "block-group.h" +#include "subpage.h"  #ifdef CONFIG_64BIT  /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 {  	compat_uptr_t clone_sources;	/* in */  	__u64 parent_root;		/* in */  	__u64 flags;			/* in */ -	__u64 reserved[4];		/* in */ +	__u32 version;			/* in */ +	__u8  reserved[28];		/* in */  } __attribute__ ((__packed__));  #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ @@ -985,129 +987,32 @@ out:  	return ret;  } -/* - * When we're defragging a range, we don't want to kick it off again - * if it is really just waiting for delalloc to send it down. - * If we find a nice big extent or delalloc range for the bytes in the - * file you want to defrag, we return 0 to let you know to skip this - * part of the file - */ -static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) -{ -	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; -	struct extent_map *em = NULL; -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; -	u64 end; - -	read_lock(&em_tree->lock); -	em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); -	read_unlock(&em_tree->lock); - -	if (em) { -		end = extent_map_end(em); -		free_extent_map(em); -		if (end - offset > thresh) -			return 0; -	} -	/* if we already have a nice delalloc here, just stop */ -	thresh /= 2; -	end = count_range_bits(io_tree, &offset, offset + thresh, -			       thresh, EXTENT_DELALLOC, 1); -	if (end >= thresh) -		return 0; -	return 1; -} - -/* - * helper function to walk through a file and find extents - * newer than a specific transid, and smaller than thresh. 
- * - * This is used by the defragging code to find new and small - * extents - */ -static int find_new_extents(struct btrfs_root *root, -			    struct inode *inode, u64 newer_than, -			    u64 *off, u32 thresh) -{ -	struct btrfs_path *path; -	struct btrfs_key min_key; -	struct extent_buffer *leaf; -	struct btrfs_file_extent_item *extent; -	int type; -	int ret; -	u64 ino = btrfs_ino(BTRFS_I(inode)); - -	path = btrfs_alloc_path(); -	if (!path) -		return -ENOMEM; - -	min_key.objectid = ino; -	min_key.type = BTRFS_EXTENT_DATA_KEY; -	min_key.offset = *off; - -	while (1) { -		ret = btrfs_search_forward(root, &min_key, path, newer_than); -		if (ret != 0) -			goto none; -process_slot: -		if (min_key.objectid != ino) -			goto none; -		if (min_key.type != BTRFS_EXTENT_DATA_KEY) -			goto none; - -		leaf = path->nodes[0]; -		extent = btrfs_item_ptr(leaf, path->slots[0], -					struct btrfs_file_extent_item); - -		type = btrfs_file_extent_type(leaf, extent); -		if (type == BTRFS_FILE_EXTENT_REG && -		    btrfs_file_extent_num_bytes(leaf, extent) < thresh && -		    check_defrag_in_cache(inode, min_key.offset, thresh)) { -			*off = min_key.offset; -			btrfs_free_path(path); -			return 0; -		} - -		path->slots[0]++; -		if (path->slots[0] < btrfs_header_nritems(leaf)) { -			btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); -			goto process_slot; -		} - -		if (min_key.offset == (u64)-1) -			goto none; - -		min_key.offset++; -		btrfs_release_path(path); -	} -none: -	btrfs_free_path(path); -	return -ENOENT; -} - -static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, +					       bool locked)  {  	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;  	struct extent_map *em; -	u64 len = PAGE_SIZE; +	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;  	/*  	 * hopefully we have this extent in the tree already, try without  	 * the full extent lock  	 */  	read_lock(&em_tree->lock); -	em = lookup_extent_mapping(em_tree, start, len); +	em = lookup_extent_mapping(em_tree, start, sectorsize);  	read_unlock(&em_tree->lock);  	if (!em) {  		struct extent_state *cached = NULL; -		u64 end = start + len - 1; +		u64 end = start + sectorsize - 1;  		/* get the big lock and read metadata off disk */ -		lock_extent_bits(io_tree, start, end, &cached); -		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); -		unlock_extent_cached(io_tree, start, end, &cached); +		if (!locked) +			lock_extent_bits(io_tree, start, end, &cached); +		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); +		if (!locked) +			unlock_extent_cached(io_tree, start, end, &cached);  		if (IS_ERR(em))  			return NULL; @@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)  	return em;  } -static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, +				     bool locked)  {  	struct extent_map *next;  	bool ret = true; @@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)  	if (em->start + em->len >= i_size_read(inode))  		return false; -	next = defrag_lookup_extent(inode, em->start + em->len); +	next = defrag_lookup_extent(inode, em->start + em->len, locked);  	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)  		ret = false;  	else if 
((em->block_start + em->block_len == next->block_start) && @@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)  	return ret;  } -static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, -			       u64 *last_len, u64 *skip, u64 *defrag_end, -			       int compress) +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. + * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, +					    pgoff_t index)  { -	struct extent_map *em; -	int ret = 1; -	bool next_mergeable = true; -	bool prev_mergeable = true; +	struct address_space *mapping = inode->vfs_inode.i_mapping; +	gfp_t mask = btrfs_alloc_write_mask(mapping); +	u64 page_start = (u64)index << PAGE_SHIFT; +	u64 page_end = page_start + PAGE_SIZE - 1; +	struct extent_state *cached_state = NULL; +	struct page *page; +	int ret; + +again: +	page = find_or_create_page(mapping, index, mask); +	if (!page) +		return ERR_PTR(-ENOMEM);  	/* -	 * make sure that once we start defragging an extent, we keep on -	 * defragging it +	 * Since we can defragment files opened read-only, we can encounter +	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We +	 * can't do I/O using huge pages yet, so return an error for now. +	 * Filesystem transparent huge pages are typically only used for +	 * executables that explicitly enable them, so this isn't very +	 * restrictive.  	 */ -	if (start < *defrag_end) -		return 1; +	if (PageCompound(page)) { +		unlock_page(page); +		put_page(page); +		return ERR_PTR(-ETXTBSY); +	} -	*skip = 0; +	ret = set_page_extent_mapped(page); +	if (ret < 0) { +		unlock_page(page); +		put_page(page); +		return ERR_PTR(ret); +	} -	em = defrag_lookup_extent(inode, start); -	if (!em) -		return 0; +	/* Wait for any existing ordered extent in the range */ +	while (1) { +		struct btrfs_ordered_extent *ordered; -	/* this will cover holes, and inline extents */ -	if (em->block_start >= EXTENT_MAP_LAST_BYTE) { -		ret = 0; -		goto out; -	} +		lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); +		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); +		unlock_extent_cached(&inode->io_tree, page_start, page_end, +				     &cached_state); +		if (!ordered) +			break; -	if (!*defrag_end) -		prev_mergeable = false; +		unlock_page(page); +		btrfs_start_ordered_extent(ordered, 1); +		btrfs_put_ordered_extent(ordered); +		lock_page(page); +		/* +		 * We unlocked the page above, so we need check if it was +		 * released or not. +		 */ +		if (page->mapping != mapping || !PagePrivate(page)) { +			unlock_page(page); +			put_page(page); +			goto again; +		} +	} -	next_mergeable = defrag_check_next_extent(inode, em); -	/* -	 * we hit a real extent, if it is big or the next extent is not a -	 * real extent, don't bother defragging it -	 */ -	if (!compress && (*last_len == 0 || *last_len >= thresh) && -	    (em->len >= thresh || (!next_mergeable && !prev_mergeable))) -		ret = 0; -out:  	/* -	 * last_len ends up being a counter of how many bytes we've defragged. -	 * every time we choose not to defrag an extent, we reset *last_len -	 * so that the next tiny extent will force a defrag. 
-	 * -	 * The end result of this is that tiny extents before a single big -	 * extent will force at least part of that big extent to be defragged. +	 * Now the page range has no ordered extent any more.  Read the page to +	 * make it uptodate.  	 */ -	if (ret) { -		*defrag_end = extent_map_end(em); -	} else { -		*last_len = 0; -		*skip = extent_map_end(em); -		*defrag_end = 0; +	if (!PageUptodate(page)) { +		btrfs_readpage(NULL, page); +		lock_page(page); +		if (page->mapping != mapping || !PagePrivate(page)) { +			unlock_page(page); +			put_page(page); +			goto again; +		} +		if (!PageUptodate(page)) { +			unlock_page(page); +			put_page(page); +			return ERR_PTR(-EIO); +		}  	} - -	free_extent_map(em); -	return ret; +	return page;  } +struct defrag_target_range { +	struct list_head list; +	u64 start; +	u64 len; +}; +  /* - * it doesn't do much good to defrag one or two pages - * at a time.  This pulls in a nice chunk of pages - * to COW and defrag. - * - * It also makes sure the delalloc code has enough - * dirty data to avoid making new small extents as part - * of the defrag + * Collect all valid target extents.   * - * It's a good idea to start RA on this range - * before calling this. + * @start:	   file offset to lookup + * @len:	   length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + *		   will be ignored + * @newer_than:    only defrag extents newer than this value + * @do_compress:   whether the defrag is doing compression + *		   if true, @extent_thresh will be ignored and all regular + *		   file extents meeting @newer_than will be targets. + * @locked:	   if the range has already held extent lock + * @target_list:   list of targets file extents   */ -static int cluster_pages_for_defrag(struct inode *inode, -				    struct page **pages, -				    unsigned long start_index, -				    unsigned long num_pages) +static int defrag_collect_targets(struct btrfs_inode *inode, +				  u64 start, u64 len, u32 extent_thresh, +				  u64 newer_than, bool do_compress, +				  bool locked, struct list_head *target_list)  { -	unsigned long file_end; -	u64 isize = i_size_read(inode); -	u64 page_start; -	u64 page_end; -	u64 page_cnt; -	u64 start = (u64)start_index << PAGE_SHIFT; -	u64 search_start; -	int ret; -	int i; -	int i_done; -	struct btrfs_ordered_extent *ordered; -	struct extent_state *cached_state = NULL; -	struct extent_io_tree *tree; -	struct extent_changeset *data_reserved = NULL; -	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); +	u64 cur = start; +	int ret = 0; -	file_end = (isize - 1) >> PAGE_SHIFT; -	if (!isize || start_index > file_end) -		return 0; +	while (cur < start + len) { +		struct extent_map *em; +		struct defrag_target_range *new; +		bool next_mergeable = true; +		u64 range_len; -	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); +		em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); +		if (!em) +			break; -	ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, -			start, page_cnt << PAGE_SHIFT); -	if (ret) -		return ret; -	i_done = 0; -	tree = &BTRFS_I(inode)->io_tree; +		/* Skip hole/inline/preallocated extents */ +		if (em->block_start >= EXTENT_MAP_LAST_BYTE || +		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) +			goto next; -	/* step one, lock all the pages */ -	for (i = 0; i < page_cnt; i++) { -		struct page *page; -again: -		page = find_or_create_page(inode->i_mapping, -					   start_index + i, mask); -		if (!page) -			break; +		/* Skip older extent */ +		if (em->generation 
< newer_than) +			goto next; -		ret = set_page_extent_mapped(page); -		if (ret < 0) { -			unlock_page(page); -			put_page(page); -			break; +		/* +		 * For do_compress case, we want to compress all valid file +		 * extents, thus no @extent_thresh or mergeable check. +		 */ +		if (do_compress) +			goto add; + +		/* Skip too large extent */ +		if (em->len >= extent_thresh) +			goto next; + +		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, +							  locked); +		if (!next_mergeable) { +			struct defrag_target_range *last; + +			/* Empty target list, no way to merge with last entry */ +			if (list_empty(target_list)) +				goto next; +			last = list_entry(target_list->prev, +					  struct defrag_target_range, list); +			/* Not mergeable with last entry */ +			if (last->start + last->len != cur) +				goto next; + +			/* Mergeable, fall through to add it to @target_list. */  		} -		page_start = page_offset(page); -		page_end = page_start + PAGE_SIZE - 1; -		while (1) { -			lock_extent_bits(tree, page_start, page_end, -					 &cached_state); -			ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), -							      page_start); -			unlock_extent_cached(tree, page_start, page_end, -					     &cached_state); -			if (!ordered) -				break; - -			unlock_page(page); -			btrfs_start_ordered_extent(ordered, 1); -			btrfs_put_ordered_extent(ordered); -			lock_page(page); -			/* -			 * we unlocked the page above, so we need check if -			 * it was released or not. -			 */ -			if (page->mapping != inode->i_mapping) { -				unlock_page(page); -				put_page(page); -				goto again; +add: +		range_len = min(extent_map_end(em), start + len) - cur; +		/* +		 * This one is a good target, check if it can be merged into +		 * last range of the target list. +		 */ +		if (!list_empty(target_list)) { +			struct defrag_target_range *last; + +			last = list_entry(target_list->prev, +					  struct defrag_target_range, list); +			ASSERT(last->start + last->len <= cur); +			if (last->start + last->len == cur) { +				/* Mergeable, enlarge the last entry */ +				last->len += range_len; +				goto next;  			} +			/* Fall through to allocate a new entry */  		} -		if (!PageUptodate(page)) { -			btrfs_readpage(NULL, page); -			lock_page(page); -			if (!PageUptodate(page)) { -				unlock_page(page); -				put_page(page); -				ret = -EIO; -				break; -			} +		/* Allocate new defrag_target_range */ +		new = kmalloc(sizeof(*new), GFP_NOFS); +		if (!new) { +			free_extent_map(em); +			ret = -ENOMEM; +			break;  		} +		new->start = cur; +		new->len = range_len; +		list_add_tail(&new->list, target_list); -		if (page->mapping != inode->i_mapping) { -			unlock_page(page); -			put_page(page); -			goto again; +next: +		cur = extent_map_end(em); +		free_extent_map(em); +	} +	if (ret < 0) { +		struct defrag_target_range *entry; +		struct defrag_target_range *tmp; + +		list_for_each_entry_safe(entry, tmp, target_list, list) { +			list_del_init(&entry->list); +			kfree(entry);  		} +	} +	return ret; +} + +#define CLUSTER_SIZE	(SZ_256K) + +/* + * Defrag one contiguous target range. + * + * @inode:	target inode + * @target:	target range to defrag + * @pages:	locked pages covering the defrag range + * @nr_pages:	number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + *   Pages should be locked, no ordered extent in the pages range, + *   no writeback. 
+ * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, +				    struct defrag_target_range *target, +				    struct page **pages, int nr_pages, +				    struct extent_state **cached_state) +{ +	struct btrfs_fs_info *fs_info = inode->root->fs_info; +	struct extent_changeset *data_reserved = NULL; +	const u64 start = target->start; +	const u64 len = target->len; +	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; +	unsigned long start_index = start >> PAGE_SHIFT; +	unsigned long first_index = page_index(pages[0]); +	int ret = 0; +	int i; + +	ASSERT(last_index - first_index + 1 <= nr_pages); + +	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); +	if (ret < 0) +		return ret; +	clear_extent_bit(&inode->io_tree, start, start + len - 1, +			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | +			 EXTENT_DEFRAG, 0, 0, cached_state); +	set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); -		pages[i] = page; -		i_done++; +	/* Update the page status */ +	for (i = start_index - first_index; i <= last_index - first_index; i++) { +		ClearPageChecked(pages[i]); +		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);  	} -	if (!i_done || ret) -		goto out; +	btrfs_delalloc_release_extents(inode, len); +	extent_changeset_free(data_reserved); -	if (!(inode->i_sb->s_flags & SB_ACTIVE)) -		goto out; +	return ret; +} -	/* -	 * so now we have a nice long stream of locked -	 * and up to date pages, lets wait on them -	 */ -	for (i = 0; i < i_done; i++) -		wait_on_page_writeback(pages[i]); +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, +			    u32 extent_thresh, u64 newer_than, bool do_compress) +{ +	struct extent_state *cached_state = NULL; +	struct defrag_target_range *entry; +	struct defrag_target_range *tmp; +	LIST_HEAD(target_list); +	struct page **pages; +	const u32 sectorsize = inode->root->fs_info->sectorsize; +	u64 last_index = (start + len - 1) >> PAGE_SHIFT; +	u64 start_index = start >> PAGE_SHIFT; +	unsigned int nr_pages = last_index - start_index + 1; +	int ret = 0; +	int i; -	page_start = page_offset(pages[0]); -	page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; +	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); +	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); + +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); +	if (!pages) +		return -ENOMEM; -	lock_extent_bits(&BTRFS_I(inode)->io_tree, -			 page_start, page_end - 1, &cached_state); +	/* Prepare all pages */ +	for (i = 0; i < nr_pages; i++) { +		pages[i] = defrag_prepare_one_page(inode, start_index + i); +		if (IS_ERR(pages[i])) { +			ret = PTR_ERR(pages[i]); +			pages[i] = NULL; +			goto free_pages; +		} +	} +	for (i = 0; i < nr_pages; i++) +		wait_on_page_writeback(pages[i]); +	/* Lock the pages range */ +	lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, +			 (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, +			 &cached_state);  	/* -	 * When defragmenting we skip ranges that have holes or inline extents, -	 * (check should_defrag_range()), to avoid unnecessary IO and wasting -	 * space. At btrfs_defrag_file(), we check if a range should be defragged -	 * before locking the inode and then, if it should, we trigger a sync -	 * page cache readahead - we lock the inode only after that to avoid -	 * blocking for too long other tasks that possibly want to operate on -	 * other file ranges. 
But before we were able to get the inode lock, -	 * some other task may have punched a hole in the range, or we may have -	 * now an inline extent, in which case we should not defrag. So check -	 * for that here, where we have the inode and the range locked, and bail -	 * out if that happened. +	 * Now we have a consistent view about the extent map, re-check +	 * which range really needs to be defragged. +	 * +	 * And this time we have extent locked already, pass @locked = true +	 * so that we won't relock the extent range and cause deadlock.  	 */ -	search_start = page_start; -	while (search_start < page_end) { -		struct extent_map *em; +	ret = defrag_collect_targets(inode, start, len, extent_thresh, +				     newer_than, do_compress, true, +				     &target_list); +	if (ret < 0) +		goto unlock_extent; -		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, -				      page_end - search_start); -		if (IS_ERR(em)) { -			ret = PTR_ERR(em); -			goto out_unlock_range; -		} -		if (em->block_start >= EXTENT_MAP_LAST_BYTE) { -			free_extent_map(em); -			/* Ok, 0 means we did not defrag anything */ -			ret = 0; -			goto out_unlock_range; +	list_for_each_entry(entry, &target_list, list) { +		ret = defrag_one_locked_target(inode, entry, pages, nr_pages, +					       &cached_state); +		if (ret < 0) +			break; +	} + +	list_for_each_entry_safe(entry, tmp, &target_list, list) { +		list_del_init(&entry->list); +		kfree(entry); +	} +unlock_extent: +	unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, +			     (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, +			     &cached_state); +free_pages: +	for (i = 0; i < nr_pages; i++) { +		if (pages[i]) { +			unlock_page(pages[i]); +			put_page(pages[i]);  		} -		search_start = extent_map_end(em); -		free_extent_map(em);  	} +	kfree(pages); +	return ret; +} -	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, -			  page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | -			  EXTENT_DEFRAG, 0, 0, &cached_state); +static int defrag_one_cluster(struct btrfs_inode *inode, +			      struct file_ra_state *ra, +			      u64 start, u32 len, u32 extent_thresh, +			      u64 newer_than, bool do_compress, +			      unsigned long *sectors_defragged, +			      unsigned long max_sectors) +{ +	const u32 sectorsize = inode->root->fs_info->sectorsize; +	struct defrag_target_range *entry; +	struct defrag_target_range *tmp; +	LIST_HEAD(target_list); +	int ret; -	if (i_done != page_cnt) { -		spin_lock(&BTRFS_I(inode)->lock); -		btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); -		spin_unlock(&BTRFS_I(inode)->lock); -		btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, -				start, (page_cnt - i_done) << PAGE_SHIFT, true); -	} +	BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); +	ret = defrag_collect_targets(inode, start, len, extent_thresh, +				     newer_than, do_compress, false, +				     &target_list); +	if (ret < 0) +		goto out; +	list_for_each_entry(entry, &target_list, list) { +		u32 range_len = entry->len; -	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, -			  &cached_state); +		/* Reached the limit */ +		if (max_sectors && max_sectors == *sectors_defragged) +			break; -	unlock_extent_cached(&BTRFS_I(inode)->io_tree, -			     page_start, page_end - 1, &cached_state); +		if (max_sectors) +			range_len = min_t(u32, range_len, +				(max_sectors - *sectors_defragged) * sectorsize); -	for (i = 0; i < i_done; i++) { -		clear_page_dirty_for_io(pages[i]); -		ClearPageChecked(pages[i]); -		set_page_dirty(pages[i]); -		
unlock_page(pages[i]); -		put_page(pages[i]); +		if (ra) +			page_cache_sync_readahead(inode->vfs_inode.i_mapping, +				ra, NULL, entry->start >> PAGE_SHIFT, +				((entry->start + range_len - 1) >> PAGE_SHIFT) - +				(entry->start >> PAGE_SHIFT) + 1); +		/* +		 * Here we may not defrag any range if holes are punched before +		 * we locked the pages. +		 * But that's fine, it only affects the @sectors_defragged +		 * accounting. +		 */ +		ret = defrag_one_range(inode, entry->start, range_len, +				       extent_thresh, newer_than, do_compress); +		if (ret < 0) +			break; +		*sectors_defragged += range_len;  	} -	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); -	extent_changeset_free(data_reserved); -	return i_done; - -out_unlock_range: -	unlock_extent_cached(&BTRFS_I(inode)->io_tree, -			     page_start, page_end - 1, &cached_state);  out: -	for (i = 0; i < i_done; i++) { -		unlock_page(pages[i]); -		put_page(pages[i]); +	list_for_each_entry_safe(entry, tmp, &target_list, list) { +		list_del_init(&entry->list); +		kfree(entry);  	} -	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, -			start, page_cnt << PAGE_SHIFT, true); -	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); -	extent_changeset_free(data_reserved);  	return ret; -  } -int btrfs_defrag_file(struct inode *inode, struct file *file, +/* + * Entry point to file defragmentation. + * + * @inode:	   inode to be defragged + * @ra:		   readahead state (can be NUL) + * @range:	   defrag options including range and flags + * @newer_than:	   minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode + *		   will be defragged. + */ +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,  		      struct btrfs_ioctl_defrag_range_args *range,  		      u64 newer_than, unsigned long max_to_defrag)  {  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); -	struct btrfs_root *root = BTRFS_I(inode)->root; -	struct file_ra_state *ra = NULL; -	unsigned long last_index; +	unsigned long sectors_defragged = 0;  	u64 isize = i_size_read(inode); -	u64 last_len = 0; -	u64 skip = 0; -	u64 defrag_end = 0; -	u64 newer_off = range->start; -	unsigned long i; -	unsigned long ra_index = 0; -	int ret; -	int defrag_count = 0; +	u64 cur; +	u64 last_byte; +	bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; +	bool ra_allocated = false;  	int compress_type = BTRFS_COMPRESS_ZLIB; +	int ret = 0;  	u32 extent_thresh = range->extent_thresh; -	unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; -	unsigned long cluster = max_cluster; -	u64 new_align = ~((u64)SZ_128K - 1); -	struct page **pages = NULL; -	bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;  	if (isize == 0)  		return 0; @@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,  	if (extent_thresh == 0)  		extent_thresh = SZ_256K; +	if (range->start + range->len > range->start) { +		/* Got a specific range */ +		last_byte = min(isize, range->start + range->len) - 1; +	} else { +		/* Defrag until file end */ +		last_byte = isize - 1; +	} +  	/* -	 * If we were not given a file, allocate a readahead context. As +	 * If we were not given a ra, allocate a readahead context. As  	 * readahead is just an optimization, defrag will work without it so  	 * we don't error out.  	 
*/ -	if (!file) { +	if (!ra) { +		ra_allocated = true;  		ra = kzalloc(sizeof(*ra), GFP_KERNEL);  		if (ra)  			file_ra_state_init(ra, inode->i_mapping); -	} else { -		ra = &file->f_ra; -	} - -	pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); -	if (!pages) { -		ret = -ENOMEM; -		goto out_ra; -	} - -	/* find the last page to defrag */ -	if (range->start + range->len > range->start) { -		last_index = min_t(u64, isize - 1, -			 range->start + range->len - 1) >> PAGE_SHIFT; -	} else { -		last_index = (isize - 1) >> PAGE_SHIFT; -	} - -	if (newer_than) { -		ret = find_new_extents(root, inode, newer_than, -				       &newer_off, SZ_64K); -		if (!ret) { -			range->start = newer_off; -			/* -			 * we always align our defrag to help keep -			 * the extents in the file evenly spaced -			 */ -			i = (newer_off & new_align) >> PAGE_SHIFT; -		} else -			goto out_ra; -	} else { -		i = range->start >> PAGE_SHIFT;  	} -	if (!max_to_defrag) -		max_to_defrag = last_index - i + 1; - -	/* -	 * make writeback starts from i, so the defrag range can be -	 * written sequentially. -	 */ -	if (i < inode->i_mapping->writeback_index) -		inode->i_mapping->writeback_index = i; - -	while (i <= last_index && defrag_count < max_to_defrag && -	       (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { -		/* -		 * make sure we stop running if someone unmounts -		 * the FS -		 */ -		if (!(inode->i_sb->s_flags & SB_ACTIVE)) -			break; -		if (btrfs_defrag_cancelled(fs_info)) { -			btrfs_debug(fs_info, "defrag_file cancelled"); -			ret = -EAGAIN; -			goto error; -		} +	/* Align the range */ +	cur = round_down(range->start, fs_info->sectorsize); +	last_byte = round_up(last_byte, fs_info->sectorsize) - 1; -		if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, -					 extent_thresh, &last_len, &skip, -					 &defrag_end, do_compress)){ -			unsigned long next; -			/* -			 * the should_defrag function tells us how much to skip -			 * bump our counter by the suggested amount -			 */ -			next = DIV_ROUND_UP(skip, PAGE_SIZE); -			i = max(i + 1, next); -			continue; -		} +	while (cur < last_byte) { +		u64 cluster_end; -		if (!newer_than) { -			cluster = (PAGE_ALIGN(defrag_end) >> -				   PAGE_SHIFT) - i; -			cluster = min(cluster, max_cluster); -		} else { -			cluster = max_cluster; -		} +		/* The cluster size 256K should always be page aligned */ +		BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); -		if (i + cluster > ra_index) { -			ra_index = max(i, ra_index); -			if (ra) -				page_cache_sync_readahead(inode->i_mapping, ra, -						file, ra_index, cluster); -			ra_index += cluster; -		} +		/* We want the cluster end at page boundary when possible */ +		cluster_end = (((cur >> PAGE_SHIFT) + +			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; +		cluster_end = min(cluster_end, last_byte);  		btrfs_inode_lock(inode, 0);  		if (IS_SWAPFILE(inode)) {  			ret = -ETXTBSY; -		} else { -			if (do_compress) -				BTRFS_I(inode)->defrag_compress = compress_type; -			ret = cluster_pages_for_defrag(inode, pages, i, cluster); +			btrfs_inode_unlock(inode, 0); +			break;  		} -		if (ret < 0) { +		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {  			btrfs_inode_unlock(inode, 0); -			goto out_ra; +			break;  		} - -		defrag_count += ret; -		balance_dirty_pages_ratelimited(inode->i_mapping); +		if (do_compress) +			BTRFS_I(inode)->defrag_compress = compress_type; +		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, +				cluster_end + 1 - cur, extent_thresh, +				newer_than, do_compress, +				§ors_defragged, max_to_defrag);  		
btrfs_inode_unlock(inode, 0); - -		if (newer_than) { -			if (newer_off == (u64)-1) -				break; - -			if (ret > 0) -				i += ret; - -			newer_off = max(newer_off + 1, -					(u64)i << PAGE_SHIFT); - -			ret = find_new_extents(root, inode, newer_than, -					       &newer_off, SZ_64K); -			if (!ret) { -				range->start = newer_off; -				i = (newer_off & new_align) >> PAGE_SHIFT; -			} else { -				break; -			} -		} else { -			if (ret > 0) { -				i += ret; -				last_len += ret << PAGE_SHIFT; -			} else { -				i++; -				last_len = 0; -			} -		} +		if (ret < 0) +			break; +		cur = cluster_end + 1;  	} -	ret = defrag_count; -error: -	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { -		filemap_flush(inode->i_mapping); -		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, -			     &BTRFS_I(inode)->runtime_flags)) +	if (ra_allocated) +		kfree(ra); +	if (sectors_defragged) { +		/* +		 * We have defragged some sectors, for compression case they +		 * need to be written back immediately. +		 */ +		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {  			filemap_flush(inode->i_mapping); +			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, +				     &BTRFS_I(inode)->runtime_flags)) +				filemap_flush(inode->i_mapping); +		} +		if (range->compress_type == BTRFS_COMPRESS_LZO) +			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); +		else if (range->compress_type == BTRFS_COMPRESS_ZSTD) +			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); +		ret = sectors_defragged;  	} - -	if (range->compress_type == BTRFS_COMPRESS_LZO) { -		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); -	} else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { -		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); -	} - -out_ra:  	if (do_compress) {  		btrfs_inode_lock(inode, 0);  		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;  		btrfs_inode_unlock(inode, 0);  	} -	if (!file) -		kfree(ra); -	kfree(pages);  	return ret;  } @@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,  static noinline int btrfs_ioctl_resize(struct file *file,  					void __user *arg)  { +	BTRFS_DEV_LOOKUP_ARGS(args);  	struct inode *inode = file_inode(file);  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	u64 new_size; @@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,  		btrfs_info(fs_info, "resizing devid %llu", devid);  	} -	device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); +	args.devid = devid; +	device = btrfs_find_device(fs_info->fs_devices, &args);  	if (!device) {  		btrfs_info(fs_info, "resizer unable to find device %llu",  			   devid); @@ -1730,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,  	}  	if (!strcmp(sizestr, "max")) -		new_size = device->bdev->bd_inode->i_size; +		new_size = bdev_nr_bytes(device->bdev);  	else {  		if (sizestr[0] == '-') {  			mod = -1; @@ -1771,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,  		ret = -EINVAL;  		goto out_finish;  	} -	if (new_size > device->bdev->bd_inode->i_size) { +	if (new_size > bdev_nr_bytes(device->bdev)) {  		ret = -EFBIG;  		goto out_finish;  	} @@ -2261,9 +2222,8 @@ static noinline int search_ioctl(struct inode *inode,  	key.offset = sk->min_offset;  	while (1) { -		ret = fault_in_pages_writeable(ubuf + sk_offset, -					       *buf_size - sk_offset); -		if (ret) +		ret = -EFAULT; +		if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))  			break;  		ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -3136,12 +3096,6 @@ static int 
btrfs_ioctl_defrag(struct file *file, void __user *argp)  		goto out;  	} -	/* Subpage defrag will be supported in later commits */ -	if (root->fs_info->sectorsize < PAGE_SIZE) { -		ret = -ENOTTY; -		goto out; -	} -  	switch (inode->i_mode & S_IFMT) {  	case S_IFDIR:  		if (!capable(CAP_SYS_ADMIN)) { @@ -3176,7 +3130,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)  			/* the rest are all set to zero by kzalloc */  			range.len = (u64)-1;  		} -		ret = btrfs_defrag_file(file_inode(file), file, +		ret = btrfs_defrag_file(file_inode(file), &file->f_ra,  					&range, BTRFS_OLDEST_GENERATION, 0);  		if (ret > 0)  			ret = 0; @@ -3220,6 +3174,7 @@ out:  static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)  { +	BTRFS_DEV_LOOKUP_ARGS(args);  	struct inode *inode = file_inode(file);  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	struct btrfs_ioctl_vol_args_v2 *vol_args; @@ -3231,35 +3186,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	ret = mnt_want_write_file(file); -	if (ret) -		return ret; -  	vol_args = memdup_user(arg, sizeof(*vol_args));  	if (IS_ERR(vol_args)) {  		ret = PTR_ERR(vol_args); -		goto err_drop; +		goto out;  	}  	if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {  		ret = -EOPNOTSUPP;  		goto out;  	} +  	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; -	if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && -	    strcmp("cancel", vol_args->name) == 0) +	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { +		args.devid = vol_args->devid; +	} else if (!strcmp("cancel", vol_args->name)) {  		cancel = true; +	} else { +		ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); +		if (ret) +			goto out; +	} + +	ret = mnt_want_write_file(file); +	if (ret) +		goto out;  	ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,  					   cancel);  	if (ret) -		goto out; -	/* Exclusive operation is now claimed */ +		goto err_drop; -	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) -		ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); -	else -		ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); +	/* Exclusive operation is now claimed */ +	ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);  	btrfs_exclop_finish(fs_info); @@ -3271,17 +3230,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)  			btrfs_info(fs_info, "device deleted: %s",  					vol_args->name);  	} -out: -	kfree(vol_args);  err_drop:  	mnt_drop_write_file(file);  	if (bdev)  		blkdev_put(bdev, mode); +out: +	btrfs_put_dev_args_from_path(&args); +	kfree(vol_args);  	return ret;  }  static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)  { +	BTRFS_DEV_LOOKUP_ARGS(args);  	struct inode *inode = file_inode(file);  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);  	struct btrfs_ioctl_vol_args *vol_args; @@ -3293,32 +3254,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; -	ret = mnt_want_write_file(file); -	if (ret) -		return ret; -  	vol_args = memdup_user(arg, sizeof(*vol_args)); -	if (IS_ERR(vol_args)) { -		ret = PTR_ERR(vol_args); -		goto out_drop_write; -	} +	if (IS_ERR(vol_args)) +		return PTR_ERR(vol_args); +  	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; -	cancel = (strcmp("cancel", vol_args->name) == 0); +	if (!strcmp("cancel", vol_args->name)) { +		cancel = true; +	} else { +		ret = btrfs_get_dev_args_from_path(fs_info, 
&args, vol_args->name); +		if (ret) +			goto out; +	} + +	ret = mnt_want_write_file(file); +	if (ret) +		goto out;  	ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,  					   cancel);  	if (ret == 0) { -		ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); +		ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);  		if (!ret)  			btrfs_info(fs_info, "disk deleted %s", vol_args->name);  		btrfs_exclop_finish(fs_info);  	} -	kfree(vol_args); -out_drop_write:  	mnt_drop_write_file(file);  	if (bdev)  		blkdev_put(bdev, mode); +out: +	btrfs_put_dev_args_from_path(&args); +	kfree(vol_args);  	return ret;  } @@ -3379,22 +3346,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,  static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,  				 void __user *arg)  { +	BTRFS_DEV_LOOKUP_ARGS(args);  	struct btrfs_ioctl_dev_info_args *di_args;  	struct btrfs_device *dev;  	int ret = 0; -	char *s_uuid = NULL;  	di_args = memdup_user(arg, sizeof(*di_args));  	if (IS_ERR(di_args))  		return PTR_ERR(di_args); +	args.devid = di_args->devid;  	if (!btrfs_is_empty_uuid(di_args->uuid)) -		s_uuid = di_args->uuid; +		args.uuid = di_args->uuid;  	rcu_read_lock(); -	dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, -				NULL); - +	dev = btrfs_find_device(fs_info->fs_devices, &args);  	if (!dev) {  		ret = -ENODEV;  		goto out; @@ -4430,7 +4396,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,  						void __user *arg)  {  	struct btrfs_ioctl_quota_rescan_args qsa = {0}; -	int ret = 0;  	if (!capable(CAP_SYS_ADMIN))  		return -EPERM; @@ -4441,9 +4406,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,  	}  	if (copy_to_user(arg, &qsa, sizeof(qsa))) -		ret = -EFAULT; +		return -EFAULT; -	return ret; +	return 0;  }  static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, |
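The btrfs_ioctl_send_args_32 hunk splits the first reserved __u64 into a __u32 version field plus a 28-byte reserved tail, so the packed structure keeps its size and the 32-bit compat ioctl number stays valid. A minimal user-space sketch of that layout invariant (the old_tail/new_tail struct names are illustrative, not from the kernel header):

/* Standalone layout check (user-space sketch, not the kernel code):
 * the old 4 x __u64 reserved tail and the new __u32 version plus
 * 28-byte reserved tail both occupy 32 bytes, so the packed struct
 * size, and therefore the compat ioctl number, does not change.
 */
#include <stdint.h>
#include <stdio.h>

struct old_tail { uint64_t reserved[4]; } __attribute__((__packed__));
struct new_tail { uint32_t version; uint8_t reserved[28]; } __attribute__((__packed__));

_Static_assert(sizeof(struct old_tail) == sizeof(struct new_tail),
	       "send args tail layout must not change size");

int main(void)
{
	printf("old tail: %zu bytes, new tail: %zu bytes\n",
	       sizeof(struct old_tail), sizeof(struct new_tail));
	return 0;
}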
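defrag_collect_targets() appends a new defrag_target_range only when the candidate extent does not start exactly where the previous target ends; otherwise it enlarges the last entry, which is also how a small non-mergeable extent is allowed to piggyback on the preceding target. A minimal user-space model of that merge policy (the array-based add_target() helper and struct range are hypothetical; the kernel keeps a list_head of defrag_target_range):

/* User-space model of how the target list grows: a candidate range
 * starting exactly where the previous target ends is merged into it,
 * anything else becomes a new entry.
 */
#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start; uint64_t len; };

static size_t add_target(struct range *targets, size_t nr,
			 uint64_t cur, uint64_t range_len)
{
	if (nr && targets[nr - 1].start + targets[nr - 1].len == cur) {
		/* Mergeable: enlarge the last entry */
		targets[nr - 1].len += range_len;
		return nr;
	}
	/* Not mergeable: allocate a new entry */
	targets[nr].start = cur;
	targets[nr].len = range_len;
	return nr + 1;
}

int main(void)
{
	struct range targets[8];
	size_t nr = 0;

	nr = add_target(targets, nr, 0, 4096);		/* first target       */
	nr = add_target(targets, nr, 4096, 8192);	/* contiguous: merged  */
	nr = add_target(targets, nr, 65536, 4096);	/* gap: new entry      */

	for (size_t i = 0; i < nr; i++)
		printf("target %zu: start=%llu len=%llu\n", i,
		       (unsigned long long)targets[i].start,
		       (unsigned long long)targets[i].len);
	return 0;
}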
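The rewritten btrfs_defrag_file() walks the requested range in 256K clusters whose ends land on page boundaries, clamps the last cluster to the final byte, and hands each cluster to defrag_one_cluster(). A standalone sketch of that arithmetic, assuming 4K pages and purely illustrative offsets:

/* Sketch of the 256K cluster walk: each iteration ends the cluster on a
 * page boundary 256K past the page containing cur, clamped to last_byte.
 * PAGE_SHIFT of 12 and the byte values are assumptions for the demo.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define SZ_256K		(256 * 1024ULL)

int main(void)
{
	uint64_t cur = 5000;			/* unaligned start offset      */
	const uint64_t last_byte = 700 * 1024 - 1;	/* pretend isize is 700K */

	while (cur < last_byte) {
		uint64_t cluster_end;

		cluster_end = (((cur >> PAGE_SHIFT) + (SZ_256K >> PAGE_SHIFT))
			       << PAGE_SHIFT) - 1;
		if (cluster_end > last_byte)
			cluster_end = last_byte;

		printf("defrag cluster: [%llu, %llu]\n",
		       (unsigned long long)cur, (unsigned long long)cluster_end);
		cur = cluster_end + 1;
	}
	return 0;
}

Within each cluster, defrag_one_cluster() additionally caps range_len against the remaining max_sectors budget and stops once sectors_defragged reaches the requested limit.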