Diffstat (limited to 'fs/btrfs/extent_io.c')
 fs/btrfs/extent_io.c | 286
 1 file changed, 161 insertions(+), 125 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68c96057ad2d..6def411b2eba 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1999,7 +1999,8 @@ static int __process_pages_contig(struct address_space *mapping,
 				if (!PageDirty(pages[i]) ||
 				    pages[i]->mapping != mapping) {
 					unlock_page(pages[i]);
-					put_page(pages[i]);
+					for (; i < ret; i++)
+						put_page(pages[i]);
 					err = -EAGAIN;
 					goto out;
 				}
@@ -2017,15 +2018,14 @@ out:
 	return err;
 }
 
-void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 				  struct page *locked_page,
 				  unsigned clear_bits,
 				  unsigned long page_ops)
 {
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
-			 NULL);
+	clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
 
-	__process_pages_contig(inode->i_mapping, locked_page,
+	__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
 			       start >> PAGE_SHIFT, end >> PAGE_SHIFT,
 			       page_ops, NULL);
 }
@@ -2122,12 +2122,11 @@ out:
 	return ret;
 }
 
-int get_state_failrec(struct extent_io_tree *tree, u64 start,
-		      struct io_failure_record **failrec)
+struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
 {
 	struct rb_node *node;
 	struct extent_state *state;
-	int ret = 0;
+	struct io_failure_record *failrec;
 
 	spin_lock(&tree->lock);
 	/*
@@ -2136,18 +2135,19 @@ int get_state_failrec(struct extent_io_tree *tree, u64 start,
 	 */
 	node = tree_search(tree, start);
 	if (!node) {
-		ret = -ENOENT;
+		failrec = ERR_PTR(-ENOENT);
 		goto out;
 	}
 	state = rb_entry(node, struct extent_state, rb_node);
 	if (state->start != start) {
-		ret = -ENOENT;
+		failrec = ERR_PTR(-ENOENT);
 		goto out;
 	}
-	*failrec = state->failrec;
+
+	failrec = state->failrec;
 out:
 	spin_unlock(&tree->lock);
-	return ret;
+	return failrec;
 }
 
 /*
@@ -2377,8 +2377,8 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
 	if (!ret)
 		return 0;
 
-	ret = get_state_failrec(failure_tree, start, &failrec);
-	if (ret)
+	failrec = get_state_failrec(failure_tree, start);
+	if (IS_ERR(failrec))
 		return 0;
 
 	BUG_ON(!failrec->this_mirror);
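The hunks above convert get_state_failrec() from an int return code plus out-parameter into a direct pointer return, with errors encoded via the kernel's ERR_PTR()/IS_ERR() convention from <linux/err.h>; clean_io_failure() is updated to test the pointer instead of a return code. For readers unfamiliar with the convention, here is a minimal userspace sketch of an errno traveling inside a pointer. The helper definitions and the always-missing lookup stub are simplified stand-ins for illustration, not the kernel's implementations.

	#include <stdio.h>
	#include <errno.h>

	/* Userspace approximations of the <linux/err.h> helpers. */
	#define MAX_ERRNO 4095

	static inline void *ERR_PTR(long error)
	{
		return (void *)error;	/* encode a negative errno in the pointer */
	}

	static inline long PTR_ERR(const void *ptr)
	{
		return (long)ptr;
	}

	static inline int IS_ERR(const void *ptr)
	{
		/* Error pointers live in the top MAX_ERRNO addresses. */
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	struct io_failure_record {
		unsigned long long start;
		unsigned long long len;
	};

	/* Stub standing in for get_state_failrec(): always misses. */
	static struct io_failure_record *lookup_failrec(unsigned long long start)
	{
		return ERR_PTR(-ENOENT);
	}

	int main(void)
	{
		struct io_failure_record *failrec = lookup_failrec(4096);

		if (IS_ERR(failrec))	/* callers test the pointer, not an int rc */
			printf("lookup failed: errno %ld\n", -PTR_ERR(failrec));
		return 0;
	}

One value and one error channel collapse into a single return, which is what lets the callers below lose their out-parameters.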
@@ -2450,8 +2450,8 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
 	spin_unlock(&failure_tree->lock);
 }
 
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
-		struct io_failure_record **failrec_ret)
+static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
+							     u64 start, u64 end)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct io_failure_record *failrec;
@@ -2462,65 +2462,8 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
 	int ret;
 	u64 logical;
 
-	ret = get_state_failrec(failure_tree, start, &failrec);
-	if (ret) {
-		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
-		if (!failrec)
-			return -ENOMEM;
-
-		failrec->start = start;
-		failrec->len = end - start + 1;
-		failrec->this_mirror = 0;
-		failrec->bio_flags = 0;
-		failrec->in_validation = 0;
-
-		read_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, start, failrec->len);
-		if (!em) {
-			read_unlock(&em_tree->lock);
-			kfree(failrec);
-			return -EIO;
-		}
-
-		if (em->start > start || em->start + em->len <= start) {
-			free_extent_map(em);
-			em = NULL;
-		}
-		read_unlock(&em_tree->lock);
-		if (!em) {
-			kfree(failrec);
-			return -EIO;
-		}
-
-		logical = start - em->start;
-		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-			logical = em->block_start;
-			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
-			extent_set_compress_type(&failrec->bio_flags,
-						 em->compress_type);
-		}
-
-		btrfs_debug(fs_info,
-			"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
-			logical, start, failrec->len);
-
-		failrec->logical = logical;
-		free_extent_map(em);
-
-		/* set the bits in the private failure tree */
-		ret = set_extent_bits(failure_tree, start, end,
-					EXTENT_LOCKED | EXTENT_DIRTY);
-		if (ret >= 0)
-			ret = set_state_failrec(failure_tree, start, failrec);
-		/* set the bits in the inode's tree */
-		if (ret >= 0)
-			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
-		if (ret < 0) {
-			kfree(failrec);
-			return ret;
-		}
-	} else {
+	failrec = get_state_failrec(failure_tree, start);
+	if (!IS_ERR(failrec)) {
 		btrfs_debug(fs_info,
 			"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
 			failrec->logical, failrec->start, failrec->len,
 			failrec->in_validation);
@@ -2530,11 +2473,66 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
 		 * (e.g. with a list for failed_mirror) to make
 		 * clean_io_failure() clean all those errors at once.
 		 */
+
+		return failrec;
 	}
 
-	*failrec_ret = failrec;
+	failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+	if (!failrec)
+		return ERR_PTR(-ENOMEM);
+
+	failrec->start = start;
+	failrec->len = end - start + 1;
+	failrec->this_mirror = 0;
+	failrec->bio_flags = 0;
+	failrec->in_validation = 0;
 
-	return 0;
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, failrec->len);
+	if (!em) {
+		read_unlock(&em_tree->lock);
+		kfree(failrec);
+		return ERR_PTR(-EIO);
+	}
+
+	if (em->start > start || em->start + em->len <= start) {
+		free_extent_map(em);
+		em = NULL;
+	}
+	read_unlock(&em_tree->lock);
+	if (!em) {
+		kfree(failrec);
+		return ERR_PTR(-EIO);
+	}
+
+	logical = start - em->start;
+	logical = em->block_start + logical;
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+		logical = em->block_start;
+		failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+		extent_set_compress_type(&failrec->bio_flags, em->compress_type);
+	}
+
+	btrfs_debug(fs_info,
+		    "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
+		    logical, start, failrec->len);
+
+	failrec->logical = logical;
+	free_extent_map(em);
+
+	/* Set the bits in the private failure tree */
+	ret = set_extent_bits(failure_tree, start, end,
+			      EXTENT_LOCKED | EXTENT_DIRTY);
+	if (ret >= 0) {
+		ret = set_state_failrec(failure_tree, start, failrec);
+		/* Set the bits in the inode's tree */
+		ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
+	} else if (ret < 0) {
+		kfree(failrec);
+		return ERR_PTR(ret);
+	}
+
+	return failrec;
 }
 
 static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
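The rework above inverts btrfs_get_io_failure_record(): the cached-record case returns early, the allocation slow path is un-indented to the top level, and the function hands back the record (or an ERR_PTR) instead of filling an out-parameter. Shape-wise this is the common lookup-or-create pattern. A condensed, self-contained sketch of that shape follows; ERR_PTR/IS_ERR and the always-missing lookup stub are the same hypothetical stand-ins as in the previous example.

	#include <stdlib.h>
	#include <errno.h>

	#define MAX_ERRNO	4095
	#define ERR_PTR(err)	((void *)(long)(err))
	#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

	struct failrec {
		unsigned long long start;
		unsigned long long len;
	};

	/* Stub standing in for get_state_failrec(): always misses. */
	static struct failrec *lookup_failrec(unsigned long long start)
	{
		return ERR_PTR(-ENOENT);
	}

	static struct failrec *get_failrec(unsigned long long start,
					   unsigned long long end)
	{
		struct failrec *rec = lookup_failrec(start);

		if (!IS_ERR(rec))
			return rec;	/* fast path: record already tracked */

		/* Slow path sits at the top level; no else branch needed. */
		rec = calloc(1, sizeof(*rec));
		if (!rec)
			return ERR_PTR(-ENOMEM);
		rec->start = start;
		rec->len = end - start + 1;
		return rec;
	}

	int main(void)
	{
		struct failrec *rec = get_failrec(0, 4095);

		return IS_ERR(rec) ? 1 : 0;
	}

The early return is what removes one nesting level from roughly sixty lines of the original function body.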
@@ -2659,16 +2657,15 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
 	struct bio *repair_bio;
 	struct btrfs_io_bio *repair_io_bio;
 	blk_status_t status;
-	int ret;
 
 	btrfs_debug(fs_info,
 		   "repair read error: read error at %llu", start);
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
-	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
-	if (ret)
-		return errno_to_blk_status(ret);
+	failrec = btrfs_get_io_failure_record(inode, start, end);
+	if (IS_ERR(failrec))
+		return errno_to_blk_status(PTR_ERR(failrec));
 
 	need_validation = btrfs_io_needs_validation(inode, failed_bio);
 
@@ -3419,7 +3416,7 @@ static void update_nr_written(struct writeback_control *wbc,
  * This returns 0 if all went well (page still locked)
  * This returns < 0 if there were errors (page still locked)
  */
-static noinline_for_stack int writepage_delalloc(struct inode *inode,
+static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
 		struct page *page, struct writeback_control *wbc,
 		u64 delalloc_start, unsigned long *nr_written)
 {
@@ -3432,7 +3429,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 
 
 	while (delalloc_end < page_end) {
-		found = find_lock_delalloc_range(inode, page,
+		found = find_lock_delalloc_range(&inode->vfs_inode, page,
 					       &delalloc_start,
 					       &delalloc_end);
 		if (!found) {
@@ -3449,8 +3446,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 			 * started, so we don't want to return > 0 unless
 			 * things are going well.
 			 */
-			ret = ret < 0 ? ret : -EIO;
-			goto done;
+			return ret < 0 ? ret : -EIO;
 		}
 		/*
 		 * delalloc_end is already one less than the total length, so
@@ -3482,10 +3478,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 		return 1;
 	}
 
-	ret = 0;
-
-done:
-	return ret;
+	return 0;
 }
 
 /*
@@ -3496,7 +3489,7 @@ done:
  * 0 if all went well (page still locked)
  * < 0 if there were errors (page still locked)
  */
-static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
 				 struct page *page,
 				 struct writeback_control *wbc,
 				 struct extent_page_data *epd,
@@ -3504,7 +3497,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 				 unsigned long nr_written,
 				 int *nr_ret)
 {
-	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_io_tree *tree = &inode->io_tree;
 	u64 start = page_offset(page);
 	u64 page_end = start + PAGE_SIZE - 1;
 	u64 end;
@@ -3536,7 +3529,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 	update_nr_written(wbc, nr_written + 1);
 
 	end = page_end;
-	blocksize = inode->i_sb->s_blocksize;
+	blocksize = inode->vfs_inode.i_sb->s_blocksize;
 
 	while (cur <= end) {
 		u64 em_end;
@@ -3547,8 +3540,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 							     page_end, 1);
 			break;
 		}
-		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
-				      end - cur + 1);
+		em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
 		if (IS_ERR_OR_NULL(em)) {
 			SetPageError(page);
 			ret = PTR_ERR_OR_ZERO(em);
@@ -3585,7 +3577,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 
 		btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
 		if (!PageWriteback(page)) {
-			btrfs_err(BTRFS_I(inode)->root->fs_info,
+			btrfs_err(inode->root->fs_info,
 				   "page %lu not writeback, cur %llu end %llu",
 			       page->index, cur, end);
 		}
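The writepage_delalloc() and __extent_writepage_io() hunks continue the ongoing conversion of internal helpers from struct inode to struct btrfs_inode: the caller performs the BTRFS_I() downcast once (the first hunk after this note shows __extent_writepage doing exactly that) and the helpers reach the VFS inode through ->vfs_inode only where needed. BTRFS_I() is the classic embedded-struct downcast; here is a userspace sketch of the idiom, with struct layouts invented purely for illustration.

	#include <stdio.h>
	#include <stddef.h>

	struct inode {
		unsigned long i_ino;		/* stand-in for the VFS inode */
	};

	struct btrfs_inode {
		unsigned long long generation;	/* fs-specific state */
		struct inode vfs_inode;		/* VFS inode embedded inside */
	};

	/* Userspace container_of(), the mechanism behind the kernel's BTRFS_I(). */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	static struct btrfs_inode *BTRFS_I(struct inode *inode)
	{
		return container_of(inode, struct btrfs_inode, vfs_inode);
	}

	int main(void)
	{
		struct btrfs_inode bi = {
			.generation = 42,
			.vfs_inode = { .i_ino = 7 },
		};
		struct btrfs_inode *inode = BTRFS_I(&bi.vfs_inode);

		/* Helpers taking btrfs_inode reach the VFS side via ->vfs_inode. */
		printf("ino=%lu gen=%llu\n", inode->vfs_inode.i_ino,
		       inode->generation);
		return 0;
	}

Converting once at the boundary is cheaper to read and avoids sprinkling BTRFS_I() through every callee, which is the stated direction of this series of cleanups.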
@@ -3658,15 +3650,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	set_page_extent_mapped(page);
 
 	if (!epd->extent_locked) {
-		ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
+		ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
+					 &nr_written);
 		if (ret == 1)
 			return 0;
 		if (ret)
 			goto done;
 	}
 
-	ret = __extent_writepage_io(inode, page, wbc, epd,
-				    i_size, nr_written, &nr);
+	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
+				    nr_written, &nr);
 	if (ret == 1)
 		return 0;
 
@@ -4126,7 +4119,7 @@ retry:
 	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		ret = flush_write_bio(&epd);
 	} else {
-		ret = -EUCLEAN;
+		ret = -EROFS;
 		end_write_bio(&epd, ret);
 	}
 	return ret;
@@ -4488,6 +4481,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
 	    page->mapping->host->i_size > SZ_16M) {
 		u64 len;
 		while (start <= end) {
+			struct btrfs_fs_info *fs_info;
+			u64 cur_gen;
+
 			len = end - start + 1;
 			write_lock(&map->lock);
 			em = lookup_extent_mapping(map, start, len);
@@ -4501,20 +4497,52 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
 				free_extent_map(em);
 				break;
 			}
-			if (!test_range_bit(tree, em->start,
-					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED, 0, NULL)) {
-				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
-					&btrfs_inode->runtime_flags);
-				remove_extent_mapping(map, em);
-				/* once for the rb tree */
-				free_extent_map(em);
-			}
+			if (test_range_bit(tree, em->start,
+					   extent_map_end(em) - 1,
+					   EXTENT_LOCKED, 0, NULL))
+				goto next;
+			/*
+			 * If it's not in the list of modified extents, used
+			 * by a fast fsync, we can remove it. If it's being
+			 * logged we can safely remove it since fsync took an
+			 * extra reference on the em.
+			 */
+			if (list_empty(&em->list) ||
+			    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+				goto remove_em;
+			/*
+			 * If it's in the list of modified extents, remove it
+			 * only if its generation is older then the current one,
+			 * in which case we don't need it for a fast fsync.
+			 * Otherwise don't remove it, we could be racing with an
+			 * ongoing fast fsync that could miss the new extent.
+			 */
+			fs_info = btrfs_inode->root->fs_info;
+			spin_lock(&fs_info->trans_lock);
+			cur_gen = fs_info->generation;
+			spin_unlock(&fs_info->trans_lock);
+			if (em->generation >= cur_gen)
+				goto next;
+remove_em:
+			/*
+			 * We only remove extent maps that are not in the list of
+			 * modified extents or that are in the list but with a
+			 * generation lower then the current generation, so there
+			 * is no need to set the full fsync flag on the inode (it
+			 * hurts the fsync performance for workloads with a data
+			 * size that exceeds or is close to the system's memory).
+			 */
+			remove_extent_mapping(map, em);
+			/* once for the rb tree */
+			free_extent_map(em);
+next:
 			start = extent_map_end(em);
 			write_unlock(&map->lock);
 			/* once for us */
 			free_extent_map(em);
+
+			cond_resched(); /* Allow large-extent preemption. */
 		}
 	}
 	return try_release_extent_state(tree, page, mask);
@@ -4669,7 +4697,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
 }
 
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		__u64 start, __u64 len)
+		  u64 start, u64 len)
 {
 	int ret = 0;
 	u64 off = start;
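Beyond the btrfs_inode plumbing, the hunks above make two behavioral changes: the writeback error path returns -EROFS instead of -EUCLEAN once the filesystem is in an error state, and try_release_extent_mapping() stops setting BTRFS_INODE_NEEDS_FULL_SYNC on every eviction. Instead it only drops extent maps that a fast fsync cannot still need, and yields with cond_resched() between iterations. The new drop policy, restated as a self-contained predicate (the field names are simplifications of the kernel's extent map state, not the real structs):

	#include <stdbool.h>
	#include <stdio.h>

	struct em_state {
		bool on_modified_list;		/* !list_empty(&em->list) */
		bool logging;			/* EXTENT_FLAG_LOGGING set */
		unsigned long long generation;
	};

	/*
	 * Mirror of the new policy: an extent map is droppable unless it sits
	 * on the modified list (a fast fsync may still need it) with a
	 * generation that is not older than the running transaction's.
	 */
	static bool can_drop_extent_map(const struct em_state *em,
					unsigned long long cur_gen)
	{
		if (!em->on_modified_list)
			return true;	/* no fsync will look at it */
		if (em->logging)
			return true;	/* fsync holds its own extra reference */
		return em->generation < cur_gen;	/* stale modified entry */
	}

	int main(void)
	{
		struct em_state racy  = { true, false, 100 };
		struct em_state stale = { true, false, 99 };

		/* prints "racy: 0, stale: 1" for cur_gen == 100 */
		printf("racy: %d, stale: %d\n",
		       can_drop_extent_map(&racy, 100),
		       can_drop_extent_map(&stale, 100));
		return 0;
	}

As the new in-diff comment explains, dropping only these maps avoids forcing full fsyncs, which hurt workloads whose data set approaches or exceeds system memory.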
@@ -5058,25 +5086,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
-	/* the ref bit is tricky.  We have to make sure it is set
-	 * if we have the buffer dirty.   Otherwise the
-	 * code to free a buffer can end up dropping a dirty
-	 * page
+	/*
+	 * The TREE_REF bit is first set when the extent_buffer is added
+	 * to the radix tree. It is also reset, if unset, when a new reference
+	 * is created by find_extent_buffer.
 	 *
-	 * Once the ref bit is set, it won't go away while the
-	 * buffer is dirty or in writeback, and it also won't
-	 * go away while we have the reference count on the
-	 * eb bumped.
+	 * It is only cleared in two cases: freeing the last non-tree
+	 * reference to the extent_buffer when its STALE bit is set or
+	 * calling releasepage when the tree reference is the only reference.
 	 *
-	 * We can't just set the ref bit without bumping the
-	 * ref on the eb because free_extent_buffer might
-	 * see the ref bit and try to clear it.  If this happens
-	 * free_extent_buffer might end up dropping our original
-	 * ref by mistake and freeing the page before we are able
-	 * to add one more ref.
+	 * In both cases, care is taken to ensure that the extent_buffer's
+	 * pages are not under io. However, releasepage can be concurrently
+	 * called with creating new references, which is prone to race
+	 * conditions between the calls to check_buffer_tree_ref in those
+	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
 	 *
-	 * So bump the ref count first, then set the bit.  If someone
-	 * beat us to it, drop the ref we added.
+	 * The actual lifetime of the extent_buffer in the radix tree is
+	 * adequately protected by the refcount, but the TREE_REF bit and
+	 * its corresponding reference are not. To protect against this
+	 * class of races, we call check_buffer_tree_ref from the codepaths
+	 * which trigger io after they set eb->io_pages. Note that once io is
+	 * initiated, TREE_REF can no longer be cleared, so that is the
+	 * moment at which any such race is best fixed.
 	 */
 	refs = atomic_read(&eb->refs);
 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5527,6 +5558,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
 	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 	eb->read_mirror = 0;
 	atomic_set(&eb->io_pages, num_reads);
+	/*
+	 * It is possible for releasepage to clear the TREE_REF bit before we
+	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+	 */
+	check_buffer_tree_ref(eb);
 	for (i = 0; i < num_pages; i++) {
 		page = eb->pages[i];
 
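The last two hunks close a race between releasepage clearing TREE_REF and a reader starting io: read_extent_buffer_pages() now re-takes the tree reference after setting eb->io_pages, the point past which TREE_REF can no longer be cleared. The body of check_buffer_tree_ref() itself is unchanged; it follows the "take the reference first, then publish the flag" discipline the old comment described. Below is a loose userspace analogue using C11 atomics. The kernel version serializes with eb->refs_lock and test_and_set_bit(), so treat this strictly as an illustration of the ordering, not the kernel code.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct buffer {
		atomic_int refs;
		atomic_bool tree_ref;	/* analogue of EXTENT_BUFFER_TREE_REF */
	};

	/*
	 * Take the reference before publishing the flag: a concurrent
	 * releaser that observes tree_ref set can then never free the
	 * buffer out from under us. If someone else already set the flag,
	 * drop the reference we speculatively took.
	 */
	static void check_tree_ref(struct buffer *b)
	{
		atomic_fetch_add(&b->refs, 1);
		if (atomic_exchange(&b->tree_ref, true))
			atomic_fetch_sub(&b->refs, 1);
	}

	int main(void)
	{
		struct buffer b = { .refs = 1, .tree_ref = false };

		check_tree_ref(&b);	/* sets the flag, keeps the extra ref */
		check_tree_ref(&b);	/* flag already set, count unchanged */
		printf("refs=%d tree_ref=%d\n", atomic_load(&b.refs),
		       atomic_load(&b.tree_ref));
		return 0;
	}

Doing the increment first is what makes the flag safe to trust: any thread that sees the flag set also sees a refcount that cannot yet drop to zero.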