diff options
Diffstat (limited to 'fs/btrfs/inode.c')
| -rw-r--r-- | fs/btrfs/inode.c | 320 | 
1 files changed, 220 insertions, 100 deletions
| diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0fa7253a2d7..a70c5790f8f5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,  	btrfs_delalloc_release_metadata(inode, end + 1 - start);  	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);  out: +	/* +	 * Don't forget to free the reserved space, as for inlined extent +	 * it won't count as data extent, free them directly here. +	 * And at reserve time, it's always aligned to page size, so +	 * just free one page here. +	 */ +	btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);  	btrfs_free_path(path);  	btrfs_end_transaction(trans, root);  	return ret; @@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work)  	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>  		PAGE_CACHE_SHIFT; +	/* +	 * atomic_sub_return implies a barrier for waitqueue_active +	 */  	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <  	    5 * 1024 * 1024 &&  	    waitqueue_active(&root->fs_info->async_submit_wait)) @@ -1294,8 +1304,14 @@ next_slot:  		num_bytes = 0;  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); -		if (found_key.objectid > ino || -		    found_key.type > BTRFS_EXTENT_DATA_KEY || +		if (found_key.objectid > ino) +			break; +		if (WARN_ON_ONCE(found_key.objectid < ino) || +		    found_key.type < BTRFS_EXTENT_DATA_KEY) { +			path->slots[0]++; +			goto next_slot; +		} +		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||  		    found_key.offset > end)  			break; @@ -1766,7 +1782,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,  		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID  		    && do_list && !(state->state & EXTENT_NORESERVE)) -			btrfs_free_reserved_data_space(inode, len); +			btrfs_free_reserved_data_space_noquota(inode, +					state->start, len);  		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,  				     root->fs_info->delalloc_batch); @@ -1861,15 +1878,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,  			  u64 bio_offset)  {  	struct btrfs_root *root = BTRFS_I(inode)->root; +	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;  	int ret = 0;  	int skip_sum; -	int metadata = 0;  	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);  	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;  	if (btrfs_is_free_space_inode(inode)) -		metadata = 2; +		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;  	if (!(rw & REQ_WRITE)) {  		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); @@ -1989,7 +2006,8 @@ again:  		goto again;  	} -	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); +	ret = btrfs_delalloc_reserve_space(inode, page_start, +					   PAGE_CACHE_SIZE);  	if (ret) {  		mapping_set_error(page->mapping, ret);  		end_extent_writepage(page, ret, page_start, page_end); @@ -2115,7 +2133,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,  	ins.type = BTRFS_EXTENT_ITEM_KEY;  	ret = btrfs_alloc_reserved_file_extent(trans, root,  					root->root_key.objectid, -					btrfs_ino(inode), file_pos, &ins); +					btrfs_ino(inode), file_pos, +					ram_bytes, &ins); +	/* +	 * Release the reserved range from inode dirty range map, as it is +	 * already moved into delayed_ref_head +	 */ +	btrfs_qgroup_release_data(inode, file_pos, ram_bytes);  out:  	btrfs_free_path(path); @@ -2573,7 +2597,7 @@ again:  	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,  			new->disk_len, 0,  			backref->root_id, backref->inum, -			new->file_pos, 0);	/* start - extent_offset */ +			new->file_pos);	/* start - extent_offset */  	if (ret) {  		btrfs_abort_transaction(trans, root, ret);  		goto out_free_path; @@ -2599,7 +2623,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)  		return;  	list_for_each_entry_safe(old, tmp, &new->head, list) { -		list_del(&old->list);  		kfree(old);  	}  	kfree(new); @@ -2824,6 +2847,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {  		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + +		/* +		 * For mwrite(mmap + memset to write) case, we still reserve +		 * space for NOCOW range. +		 * As NOCOW won't cause a new delayed ref, just free the space +		 */ +		btrfs_qgroup_free_data(inode, ordered_extent->file_offset, +				       ordered_extent->len);  		btrfs_ordered_update_i_size(inode, 0, ordered_extent);  		if (nolock)  			trans = btrfs_join_transaction_nolock(root); @@ -3018,8 +3049,6 @@ static int __readpage_endio_check(struct inode *inode,  	char *kaddr;  	u32 csum_expected;  	u32 csum = ~(u32)0; -	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, -				      DEFAULT_RATELIMIT_BURST);  	csum_expected = *(((u32 *)io_bio->csum) + icsum); @@ -3032,9 +3061,8 @@ static int __readpage_endio_check(struct inode *inode,  	kunmap_atomic(kaddr);  	return 0;  zeroit: -	if (__ratelimit(&_rs)) -		btrfs_warn(BTRFS_I(inode)->root->fs_info, -			   "csum failed ino %llu off %llu csum %u expected csum %u", +	btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, +		"csum failed ino %llu off %llu csum %u expected csum %u",  			   btrfs_ino(inode), start, csum, csum_expected);  	memset(kaddr + pgoff, 1, len);  	flush_dcache_page(page); @@ -4018,9 +4046,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,   */  static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)  { -	struct btrfs_trans_handle *trans;  	struct btrfs_root *root = BTRFS_I(dir)->root; -	int ret;  	/*  	 * 1 for the possible orphan item @@ -4029,27 +4055,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)  	 * 1 for the inode ref  	 * 1 for the inode  	 */ -	trans = btrfs_start_transaction(root, 5); -	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) -		return trans; - -	if (PTR_ERR(trans) == -ENOSPC) { -		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - -		trans = btrfs_start_transaction(root, 0); -		if (IS_ERR(trans)) -			return trans; -		ret = btrfs_cond_migrate_bytes(root->fs_info, -					       &root->fs_info->trans_block_rsv, -					       num_bytes, 5); -		if (ret) { -			btrfs_end_transaction(trans, root); -			return ERR_PTR(ret); -		} -		trans->block_rsv = &root->fs_info->trans_block_rsv; -		trans->bytes_reserved = num_bytes; -	} -	return trans; +	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);  }  static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4217,6 +4223,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,  } +static int truncate_inline_extent(struct inode *inode, +				  struct btrfs_path *path, +				  struct btrfs_key *found_key, +				  const u64 item_end, +				  const u64 new_size) +{ +	struct extent_buffer *leaf = path->nodes[0]; +	int slot = path->slots[0]; +	struct btrfs_file_extent_item *fi; +	u32 size = (u32)(new_size - found_key->offset); +	struct btrfs_root *root = BTRFS_I(inode)->root; + +	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + +	if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { +		loff_t offset = new_size; +		loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + +		/* +		 * Zero out the remaining of the last page of our inline extent, +		 * instead of directly truncating our inline extent here - that +		 * would be much more complex (decompressing all the data, then +		 * compressing the truncated data, which might be bigger than +		 * the size of the inline extent, resize the extent, etc). +		 * We release the path because to get the page we might need to +		 * read the extent item from disk (data not in the page cache). +		 */ +		btrfs_release_path(path); +		return btrfs_truncate_page(inode, offset, page_end - offset, 0); +	} + +	btrfs_set_file_extent_ram_bytes(leaf, fi, size); +	size = btrfs_file_extent_calc_inline_size(size); +	btrfs_truncate_item(root, path, size, 1); + +	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) +		inode_sub_bytes(inode, item_end + 1 - new_size); + +	return 0; +} +  /*   * this can truncate away extent items, csum items and directory items.   * It starts at a high offset and removes keys until it can't find @@ -4411,27 +4458,40 @@ search_again:  			 * special encodings  			 */  			if (!del_item && -			    btrfs_file_extent_compression(leaf, fi) == 0 &&  			    btrfs_file_extent_encryption(leaf, fi) == 0 &&  			    btrfs_file_extent_other_encoding(leaf, fi) == 0) { -				u32 size = new_size - found_key.offset; - -				if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) -					inode_sub_bytes(inode, item_end + 1 - -							new_size);  				/* -				 * update the ram bytes to properly reflect -				 * the new size of our item +				 * Need to release path in order to truncate a +				 * compressed extent. So delete any accumulated +				 * extent items so far.  				 */ -				btrfs_set_file_extent_ram_bytes(leaf, fi, size); -				size = -				    btrfs_file_extent_calc_inline_size(size); -				btrfs_truncate_item(root, path, size, 1); +				if (btrfs_file_extent_compression(leaf, fi) != +				    BTRFS_COMPRESS_NONE && pending_del_nr) { +					err = btrfs_del_items(trans, root, path, +							      pending_del_slot, +							      pending_del_nr); +					if (err) { +						btrfs_abort_transaction(trans, +									root, +									err); +						goto error; +					} +					pending_del_nr = 0; +				} + +				err = truncate_inline_extent(inode, path, +							     &found_key, +							     item_end, +							     new_size); +				if (err) { +					btrfs_abort_transaction(trans, +								root, err); +					goto error; +				}  			} else if (test_bit(BTRFS_ROOT_REF_COWS,  					    &root->state)) { -				inode_sub_bytes(inode, item_end + 1 - -						found_key.offset); +				inode_sub_bytes(inode, item_end + 1 - new_size);  			}  		}  delete: @@ -4461,7 +4521,7 @@ delete:  			ret = btrfs_free_extent(trans, root, extent_start,  						extent_num_bytes, 0,  						btrfs_header_owner(leaf), -						ino, extent_offset, 0); +						ino, extent_offset);  			BUG_ON(ret);  			if (btrfs_should_throttle_delayed_refs(trans, root))  				btrfs_async_run_delayed_refs(root, @@ -4575,14 +4635,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,  	if ((offset & (blocksize - 1)) == 0 &&  	    (!len || ((len & (blocksize - 1)) == 0)))  		goto out; -	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); +	ret = btrfs_delalloc_reserve_space(inode, +			round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);  	if (ret)  		goto out;  again:  	page = find_or_create_page(mapping, index, mask);  	if (!page) { -		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +		btrfs_delalloc_release_space(inode, +				round_down(from, PAGE_CACHE_SIZE), +				PAGE_CACHE_SIZE);  		ret = -ENOMEM;  		goto out;  	} @@ -4650,7 +4713,8 @@ again:  out_unlock:  	if (ret) -		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +		btrfs_delalloc_release_space(inode, page_start, +					     PAGE_CACHE_SIZE);  	unlock_page(page);  	page_cache_release(page);  out: @@ -5048,6 +5112,18 @@ static void evict_inode_truncate_pages(struct inode *inode)  		spin_unlock(&io_tree->lock);  		lock_extent_bits(io_tree, start, end, 0, &cached_state); + +		/* +		 * If still has DELALLOC flag, the extent didn't reach disk, +		 * and its reserved space won't be freed by delayed_ref. +		 * So we need to free its reserved space here. +		 * (Refer to comment in btrfs_invalidatepage, case 2) +		 * +		 * Note, end is the bytenr of last byte, so we need + 1 here. +		 */ +		if (state->state & EXTENT_DELALLOC) +			btrfs_qgroup_free_data(inode, start, end - start + 1); +  		clear_extent_bit(io_tree, start, end,  				 EXTENT_LOCKED | EXTENT_DIRTY |  				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | @@ -5084,7 +5160,8 @@ void btrfs_evict_inode(struct inode *inode)  		goto no_delete;  	}  	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ -	btrfs_wait_ordered_range(inode, 0, (u64)-1); +	if (!special_file(inode->i_mode)) +		btrfs_wait_ordered_range(inode, 0, (u64)-1);  	btrfs_free_io_failure_record(inode, 0, (u64)-1); @@ -6267,9 +6344,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,  	u64 objectid;  	u64 index = 0; -	if (!new_valid_dev(rdev)) -		return -EINVAL; -  	/*  	 * 2 for inode item and ref  	 * 2 for dir items @@ -7408,6 +7482,32 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,  	return em;  } +struct btrfs_dio_data { +	u64 outstanding_extents; +	u64 reserve; +}; + +static void adjust_dio_outstanding_extents(struct inode *inode, +					   struct btrfs_dio_data *dio_data, +					   const u64 len) +{ +	unsigned num_extents; + +	num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1, +					   BTRFS_MAX_EXTENT_SIZE); +	/* +	 * If we have an outstanding_extents count still set then we're +	 * within our reservation, otherwise we need to adjust our inode +	 * counter appropriately. +	 */ +	if (dio_data->outstanding_extents) { +		dio_data->outstanding_extents -= num_extents; +	} else { +		spin_lock(&BTRFS_I(inode)->lock); +		BTRFS_I(inode)->outstanding_extents += num_extents; +		spin_unlock(&BTRFS_I(inode)->lock); +	} +}  static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  				   struct buffer_head *bh_result, int create) @@ -7415,10 +7515,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	struct extent_map *em;  	struct btrfs_root *root = BTRFS_I(inode)->root;  	struct extent_state *cached_state = NULL; +	struct btrfs_dio_data *dio_data = NULL;  	u64 start = iblock << inode->i_blkbits;  	u64 lockstart, lockend;  	u64 len = bh_result->b_size; -	u64 *outstanding_extents = NULL;  	int unlock_bits = EXTENT_LOCKED;  	int ret = 0; @@ -7436,7 +7536,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  		 * that anything that needs to check if there's a transction doesn't get  		 * confused.  		 */ -		outstanding_extents = current->journal_info; +		dio_data = current->journal_info;  		current->journal_info = NULL;  	} @@ -7444,8 +7544,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,  	 * If this errors out it's because we couldn't invalidate pagecache for  	 * this range and we need to fallback to buffered.  	 */ -	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) -		return -ENOTBLK; +	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, +			       create)) { +		ret = -ENOTBLK; +		goto err; +	}  	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);  	if (IS_ERR(em)) { @@ -7563,22 +7666,11 @@ unlock:  		if (start + len > i_size_read(inode))  			i_size_write(inode, start + len); -		/* -		 * If we have an outstanding_extents count still set then we're -		 * within our reservation, otherwise we need to adjust our inode -		 * counter appropriately. -		 */ -		if (*outstanding_extents) { -			(*outstanding_extents)--; -		} else { -			spin_lock(&BTRFS_I(inode)->lock); -			BTRFS_I(inode)->outstanding_extents++; -			spin_unlock(&BTRFS_I(inode)->lock); -		} - -		current->journal_info = outstanding_extents; -		btrfs_free_reserved_data_space(inode, len); -		set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); +		adjust_dio_outstanding_extents(inode, dio_data, len); +		btrfs_free_reserved_data_space(inode, start, len); +		WARN_ON(dio_data->reserve < len); +		dio_data->reserve -= len; +		current->journal_info = dio_data;  	}  	/* @@ -7601,8 +7693,17 @@ unlock:  unlock_err:  	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,  			 unlock_bits, 1, 0, &cached_state, GFP_NOFS); -	if (outstanding_extents) -		current->journal_info = outstanding_extents; +err: +	if (dio_data) +		current->journal_info = dio_data; +	/* +	 * Compensate the delalloc release we do in btrfs_direct_IO() when we +	 * write less data then expected, so that we don't underflow our inode's +	 * outstanding extents counter. +	 */ +	if (create && dio_data) +		adjust_dio_outstanding_extents(inode, dio_data, len); +  	return ret;  } @@ -8329,7 +8430,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,  {  	struct file *file = iocb->ki_filp;  	struct inode *inode = file->f_mapping->host; -	u64 outstanding_extents = 0; +	struct btrfs_root *root = BTRFS_I(inode)->root; +	struct btrfs_dio_data dio_data = { 0 };  	size_t count = 0;  	int flags = 0;  	bool wakeup = true; @@ -8364,10 +8466,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,  			mutex_unlock(&inode->i_mutex);  			relock = true;  		} -		ret = btrfs_delalloc_reserve_space(inode, count); +		ret = btrfs_delalloc_reserve_space(inode, offset, count);  		if (ret)  			goto out; -		outstanding_extents = div64_u64(count + +		dio_data.outstanding_extents = div64_u64(count +  						BTRFS_MAX_EXTENT_SIZE - 1,  						BTRFS_MAX_EXTENT_SIZE); @@ -8376,7 +8478,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,  		 * do the accounting properly if we go over the number we  		 * originally calculated.  Abuse current->journal_info for this.  		 */ -		current->journal_info = &outstanding_extents; +		dio_data.reserve = round_up(count, root->sectorsize); +		current->journal_info = &dio_data;  	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,  				     &BTRFS_I(inode)->runtime_flags)) {  		inode_dio_end(inode); @@ -8391,18 +8494,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,  	if (iov_iter_rw(iter) == WRITE) {  		current->journal_info = NULL;  		if (ret < 0 && ret != -EIOCBQUEUED) { -			/* -			 * If the error comes from submitting stage, -			 * btrfs_get_blocsk_direct() has free'd data space, -			 * and metadata space will be handled by -			 * finish_ordered_fn, don't do that again to make -			 * sure bytes_may_use is correct. -			 */ -			if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, -				     &BTRFS_I(inode)->runtime_flags)) -				btrfs_delalloc_release_space(inode, count); +			if (dio_data.reserve) +				btrfs_delalloc_release_space(inode, offset, +							     dio_data.reserve);  		} else if (ret >= 0 && (size_t)ret < count) -			btrfs_delalloc_release_space(inode, +			btrfs_delalloc_release_space(inode, offset,  						     count - (size_t)ret);  	}  out: @@ -8561,6 +8657,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,  		}  	} +	/* +	 * Qgroup reserved space handler +	 * Page here will be either +	 * 1) Already written to disk +	 *    In this case, its reserved space is released from data rsv map +	 *    and will be freed by delayed_ref handler finally. +	 *    So even we call qgroup_free_data(), it won't decrease reserved +	 *    space. +	 * 2) Not written to disk +	 *    This means the reserved space should be freed here. +	 */ +	btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);  	if (!inode_evicting) {  		clear_extent_bit(tree, page_start, page_end,  				 EXTENT_LOCKED | EXTENT_DIRTY | @@ -8611,7 +8719,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  	u64 page_end;  	sb_start_pagefault(inode->i_sb); -	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); +	page_start = page_offset(page); +	page_end = page_start + PAGE_CACHE_SIZE - 1; + +	ret = btrfs_delalloc_reserve_space(inode, page_start, +					   PAGE_CACHE_SIZE);  	if (!ret) {  		ret = file_update_time(vma->vm_file);  		reserved = 1; @@ -8630,8 +8742,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)  again:  	lock_page(page);  	size = i_size_read(inode); -	page_start = page_offset(page); -	page_end = page_start + PAGE_CACHE_SIZE - 1;  	if ((page->mapping != inode->i_mapping) ||  	    (page_start >= size)) { @@ -8708,7 +8818,7 @@ out_unlock:  	}  	unlock_page(page);  out: -	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +	btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);  out_noreserve:  	sb_end_pagefault(inode->i_sb);  	return ret; @@ -8997,6 +9107,7 @@ void btrfs_destroy_inode(struct inode *inode)  			btrfs_put_ordered_extent(ordered);  		}  	} +	btrfs_qgroup_check_reserved_leak(inode);  	inode_tree_del(inode);  	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);  free: @@ -9633,6 +9744,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  	u64 cur_offset = start;  	u64 i_size;  	u64 cur_bytes; +	u64 last_alloc = (u64)-1;  	int ret = 0;  	bool own_trans = true; @@ -9649,6 +9761,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);  		cur_bytes = max(cur_bytes, min_size); +		/* +		 * If we are severely fragmented we could end up with really +		 * small allocations, so if the allocator is returning small +		 * chunks lets make its job easier by only searching for those +		 * sized chunks. +		 */ +		cur_bytes = min(cur_bytes, last_alloc);  		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,  					   *alloc_hint, &ins, 1, 0);  		if (ret) { @@ -9657,6 +9776,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,  			break;  		} +		last_alloc = ins.offset;  		ret = insert_reserved_file_extent(trans, inode,  						  cur_offset, ins.objectid,  						  ins.offset, ins.offset, |