Diffstat (limited to 'fs/bcachefs/io_write.c')
-rw-r--r--	fs/bcachefs/io_write.c	125
1 file changed, 69 insertions, 56 deletions
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index f137252bccc5..9401d13e31bb 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -166,7 +166,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 	bch2_trans_copy_iter(&iter, extent_iter);
 	for_each_btree_key_upto_continue_norestart(iter,
-				new->k.p, BTREE_ITER_SLOTS, old, ret) {
+				new->k.p, BTREE_ITER_slots, old, ret) {
 		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
 			max(bkey_start_offset(&new->k),
 			    bkey_start_offset(old.k));
@@ -199,9 +199,6 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 						    u64 new_i_size,
 						    s64 i_sectors_delta)
 {
-	struct btree_iter iter;
-	struct bkey_i *k;
-	struct bkey_i_inode_v3 *inode;
 	/*
 	 * Crazy performance optimization:
 	 * Every extent update needs to also update the inode: the inode trigger
@@ -213,26 +210,37 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 	 * to be journalled - if we crash, the bi_journal_seq update will be
 	 * lost, but that's fine.
 	 */
-	unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
-	int ret;
+	unsigned inode_update_flags = BTREE_UPDATE_nojournal;
 
-	k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
 			      SPOS(0,
 				   extent_iter->pos.inode,
 				   extent_iter->snapshot),
-			      BTREE_ITER_CACHED);
-	ret = PTR_ERR_OR_ZERO(k);
+			      BTREE_ITER_cached);
+	int ret = bkey_err(k);
 	if (unlikely(ret))
 		return ret;
 
-	if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
-		k = bch2_inode_to_v3(trans, k);
-		ret = PTR_ERR_OR_ZERO(k);
+	/*
+	 * varint_decode_fast(), in the inode .invalid method, reads up to 7
+	 * bytes past the end of the buffer:
+	 */
+	struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
+	ret = PTR_ERR_OR_ZERO(k_mut);
+	if (unlikely(ret))
+		goto err;
+
+	bkey_reassemble(k_mut, k);
+
+	if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
+		k_mut = bch2_inode_to_v3(trans, k_mut);
+		ret = PTR_ERR_OR_ZERO(k_mut);
 		if (unlikely(ret))
 			goto err;
 	}
 
-	inode = bkey_i_to_inode_v3(k);
+	struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);
 
 	if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
 	    new_i_size > le64_to_cpu(inode->v.bi_size)) {
@@ -251,7 +259,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
 	}
 
 	ret = bch2_trans_update(trans, &iter, &inode->k_i,
-				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+				BTREE_UPDATE_internal_snapshot_node|
 				inode_update_flags);
 err:
 	bch2_trans_iter_exit(trans, &iter);
@@ -360,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
 
 		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 				     bkey_start_pos(&sk.k->k),
-				     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+				     BTREE_ITER_slots|BTREE_ITER_intent);
 
 		ret =   bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
 			bch2_extent_update(trans, inum, &iter, sk.k,
@@ -399,13 +407,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 	BUG_ON(c->opts.nochanges);
 
 	bkey_for_each_ptr(ptrs, ptr) {
-		BUG_ON(!bch2_dev_exists2(c, ptr->dev));
-
-		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bch_dev *ca = nocow
+			? bch2_dev_have_ref(c, ptr->dev)
+			: bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
 
 		if (to_entry(ptr + 1) < ptrs.end) {
-			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
-						GFP_NOFS, &ca->replica_set));
+			n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
 
 			n->bio.bi_end_io	= wbio->bio.bi_end_io;
 			n->bio.bi_private	= wbio->bio.bi_private;
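Note on the bch2_extent_update_i_size_sectors() hunk above: the +8 in the bch2_trans_kmalloc_nomemzero() size exists because varint_decode_fast() may read up to 7 bytes past the end of its input, so any buffer handed to it must be over-allocated. A minimal userspace sketch of the same defensive pattern (names invented for illustration, not bcachefs API):

#include <stdlib.h>
#include <string.h>

/* Copy `len` bytes into a buffer padded so a decoder that performs one
 * unaligned 8-byte load at the tail never reads past the allocation. */
static void *copy_for_overreading_decoder(const void *src, size_t len)
{
	void *buf = malloc(len + 8);	/* +8 tolerates a 7-byte overread */

	if (buf)
		memcpy(buf, src, len);	/* the padding stays uninitialized */
	return buf;
}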
@@ -422,11 +429,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
 		n->c			= c;
 		n->dev			= ptr->dev;
-		n->have_ioref		= nocow || bch2_dev_get_ioref(ca,
-					type == BCH_DATA_btree ? READ : WRITE);
+		n->have_ioref		= ca != NULL;
 		n->nocow		= nocow;
 		n->submit_time		= local_clock();
 		n->inode_offset		= bkey_start_offset(&k->k);
+		if (nocow)
+			n->nocow_bucket	= PTR_BUCKET_NR(ca, ptr);
 		n->bio.bi_iter.bi_sector = ptr->offset;
 
 		if (likely(n->have_ioref)) {
@@ -473,7 +481,6 @@ static void bch2_write_done(struct closure *cl)
 static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 {
 	struct keylist *keys = &op->insert_keys;
-	struct bch_extent_ptr *ptr;
 	struct bkey_i *src, *dst = keys->keys, *n;
 
 	for (src = keys->keys; src != keys->top; src = n) {
@@ -642,7 +649,9 @@ static void bch2_write_endio(struct bio *bio)
 	struct bch_write_bio *wbio	= to_wbio(bio);
 	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
 	struct bch_fs *c		= wbio->c;
-	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
+	struct bch_dev *ca		= wbio->have_ioref
+		? bch2_dev_have_ref(c, wbio->dev)
+		: NULL;
 
 	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
 				    op->pos.inode,
@@ -653,8 +662,12 @@ static void bch2_write_endio(struct bio *bio)
 		op->flags |= BCH_WRITE_IO_ERROR;
 	}
 
-	if (wbio->nocow)
+	if (wbio->nocow) {
+		bch2_bucket_nocow_unlock(&c->nocow_locks,
+					 POS(ca->dev_idx, wbio->nocow_bucket),
+					 BUCKET_NOCOW_LOCK_UPDATE);
 		set_bit(wbio->dev, op->devs_need_flush->d);
+	}
 
 	if (wbio->have_ioref) {
 		bch2_latency_acct(ca, wbio->submit_time, WRITE);
@@ -1093,30 +1106,21 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
 		return false;
 
 	e = bkey_s_c_to_extent(k);
+
+	rcu_read_lock();
 	extent_for_each_ptr_decode(e, p, entry) {
-		if (crc_is_encoded(p.crc) || p.has_ec)
+		if (crc_is_encoded(p.crc) || p.has_ec) {
+			rcu_read_unlock();
 			return false;
+		}
 
 		replicas += bch2_extent_ptr_durability(c, &p);
 	}
+	rcu_read_unlock();
 
 	return replicas >= op->opts.data_replicas;
 }
 
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
-	struct bch_fs *c = op->c;
-
-	for_each_keylist_key(&op->insert_keys, k) {
-		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
-		bkey_for_each_ptr(ptrs, ptr)
-			bch2_bucket_nocow_unlock(&c->nocow_locks,
-						 PTR_BUCKET_POS(c, ptr),
-						 BUCKET_NOCOW_LOCK_UPDATE);
-	}
-}
-
 static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 						  struct btree_iter *iter,
 						  struct bkey_i *orig,
@@ -1150,7 +1154,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 	return  bch2_extent_update_i_size_sectors(trans, iter,
 					min(new->k.p.offset << 9, new_i_size), 0) ?:
 		bch2_trans_update(trans, iter, new,
-				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+				  BTREE_UPDATE_internal_snapshot_node);
 }
 
 static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
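Note on the bch2_extent_is_writeable() hunk above: once the pointer walk is bracketed by rcu_read_lock()/rcu_read_unlock(), every exit path must drop the lock, including the new early return inside the loop. A userspace sketch of that rule, with a pthread rwlock standing in for the RCU read lock (types and names invented for the sketch):

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Return false if any pointer is encoded or erasure coded; every exit
 * path, including the early one, must release the read-side lock. */
static bool all_ptrs_plain(const bool *encoded, int nr)
{
	pthread_rwlock_rdlock(&table_lock);
	for (int i = 0; i < nr; i++)
		if (encoded[i]) {
			pthread_rwlock_unlock(&table_lock);	/* early exit: unlock first */
			return false;
		}
	pthread_rwlock_unlock(&table_lock);
	return true;
}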
@@ -1161,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 
 	for_each_keylist_key(&op->insert_keys, orig) {
 		int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
 				     bkey_start_pos(&orig->k), orig->k.p,
-				     BTREE_ITER_INTENT, k,
+				     BTREE_ITER_intent, k,
 				     NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
 			bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
 		}));
@@ -1187,8 +1191,6 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 
 static void __bch2_nocow_write_done(struct bch_write_op *op)
 {
-	bch2_nocow_write_unlock(op);
-
 	if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
 		op->error = -EIO;
 	} else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
@@ -1234,12 +1236,16 @@ retry:
 
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     SPOS(op->pos.inode, op->pos.offset, snapshot),
-			     BTREE_ITER_SLOTS);
+			     BTREE_ITER_slots);
 	while (1) {
 		struct bio *bio = &op->wbio.bio;
 
 		buckets.nr = 0;
 
+		ret = bch2_trans_relock(trans);
+		if (ret)
+			break;
+
 		k = bch2_btree_iter_peek_slot(&iter);
 		ret = bkey_err(k);
 		if (ret)
@@ -1259,14 +1265,15 @@ retry:
 		/* Get iorefs before dropping btree locks: */
 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 		bkey_for_each_ptr(ptrs, ptr) {
-			struct bpos b = PTR_BUCKET_POS(c, ptr);
+			struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+			if (unlikely(!ca))
+				goto err_get_ioref;
+
+			struct bpos b = PTR_BUCKET_POS(ca, ptr);
 			struct nocow_lock_bucket *l =
 				bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b));
 			prefetch(l);
 
-			if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
-				goto err_get_ioref;
-
 			/* XXX allocating memory with btree locks held - rare */
 			darray_push_gfp(&buckets, ((struct bucket_to_lock) {
 						   .b = b, .gen = ptr->gen, .l = l,
@@ -1285,7 +1292,7 @@ retry:
 			bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
 
 		darray_for_each(buckets, i) {
-			struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode);
+			struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode);
 
 			__bch2_bucket_nocow_lock(&c->nocow_locks, i->l,
 						 bucket_to_u64(i->b),
@@ -1362,7 +1369,7 @@ err:
 	return;
 err_get_ioref:
 	darray_for_each(buckets, i)
-		percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref);
+		percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
 
 	/* Fall back to COW path: */
 	goto out;
@@ -1483,7 +1490,11 @@ err:
 	if ((op->flags & BCH_WRITE_SYNC) ||
 	    (!(op->flags & BCH_WRITE_DONE) &&
 	     !(op->flags & BCH_WRITE_IN_WORKER))) {
-		closure_sync(&op->cl);
+		if (closure_sync_timeout(&op->cl, HZ * 10)) {
+			bch2_print_allocator_stuck(c);
+			closure_sync(&op->cl);
+		}
+
 		__bch2_write_index(op);
 
 		if (!(op->flags & BCH_WRITE_DONE))
@@ -1505,6 +1516,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
 	unsigned sectors;
 	int ret;
 
+	memset(&op->failed, 0, sizeof(op->failed));
+
 	op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
 	op->flags |= BCH_WRITE_DONE;
 
@@ -1639,8 +1652,7 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
 	prt_bitflags(out, bch2_write_flags, op->flags);
 	prt_newline(out);
 
-	prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
-	prt_newline(out);
+	prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
 
 	printbuf_indent_sub(out, 2);
 }
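Note on the closure_sync_timeout() hunk above: rather than blocking indefinitely, the write path now waits ten seconds, reports a suspected stuck allocator, and only then falls back to an unbounded wait (the hunk treats a nonzero return as a timeout). A rough userspace analogue using a condition variable; all names here are invented for the sketch:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Wait up to 10s for *done; report a stall, then wait unboundedly. */
static void sync_with_stall_report(pthread_mutex_t *lock,
				   pthread_cond_t *cond, const int *done)
{
	struct timespec until;

	clock_gettime(CLOCK_REALTIME, &until);
	until.tv_sec += 10;			/* mirrors HZ * 10 above */

	pthread_mutex_lock(lock);
	while (!*done)
		if (pthread_cond_timedwait(cond, lock, &until)) {
			fprintf(stderr, "write not finishing - allocator stuck?\n");
			while (!*done)		/* unbounded fallback wait */
				pthread_cond_wait(cond, lock);
			break;
		}
	pthread_mutex_unlock(lock);
}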
@@ -1648,13 +1660,14 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
 
 void bch2_fs_io_write_exit(struct bch_fs *c)
 {
 	mempool_exit(&c->bio_bounce_pages);
+	bioset_exit(&c->replica_set);
 	bioset_exit(&c->bio_write);
 }
 
 int bch2_fs_io_write_init(struct bch_fs *c)
 {
-	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
-			BIOSET_NEED_BVECS))
+	if (bioset_init(&c->bio_write,   1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
+	    bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
 		return -BCH_ERR_ENOMEM_bio_write_init;
 
 	if (mempool_init_page_pool(&c->bio_bounce_pages,
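Note on the last hunk: the new per-filesystem replica_set bioset is paired with bio_write in both directions: either bioset_init() failure short-circuits into the same error, and bch2_fs_io_write_exit() tears down in the opposite order from init. The same shape in a self-contained userspace sketch (illustrative names, not the kernel API):

#include <stdlib.h>

struct write_pools {
	void *bio_write;
	void *replica_set;
};

static void write_pools_exit(struct write_pools *p)
{
	free(p->replica_set);		/* reverse of init order */
	free(p->bio_write);
}

static int write_pools_init(struct write_pools *p)
{
	p->bio_write = p->replica_set = NULL;

	/* || short-circuits: one combined check, one error code */
	if (!(p->bio_write   = malloc(4096)) ||
	    !(p->replica_set = malloc(4096))) {
		write_pools_exit(p);	/* free(NULL) is a safe no-op */
		return -1;
	}
	return 0;
}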