diff options
Diffstat (limited to 'fs/bcachefs/ec.c')
-rw-r--r-- | fs/bcachefs/ec.c | 303 |
1 files changed, 219 insertions, 84 deletions
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 141a4c63142f..1587c6e1866a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -18,6 +18,7 @@ #include "ec.h" #include "error.h" #include "io_read.h" +#include "io_write.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" @@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, bch2_prt_csum_type(out, s.csum_type); prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + if (s.disk_label) { + prt_str(out, " label"); + bch2_disk_path_to_text(out, c, s.disk_label - 1); + } + for (unsigned i = 0; i < s.nr_blocks; i++) { const struct bch_extent_ptr *ptr = sp->ptrs + i; if ((void *) ptr >= bkey_val_end(k)) break; + prt_char(out, ' '); bch2_extent_ptr_to_text(out, c, ptr); if (s.csum_type < BCH_CSUM_NR && @@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->stripe, s.k->p.offset, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } } else { @@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bucket.inode, bucket.offset, a->gen, a->stripe, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } 
} @@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); if (unlikely(!ca)) { - if (!(flags & BTREE_TRIGGER_overwrite)) - ret = -EIO; + if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_mark_stripe; goto err; } @@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err_unlock; } @@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } +static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) +{ + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->disk_label = s->disk_label; + m->blocks_nonempty = 0; + + for (unsigned i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); +} + int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, memset(m, 0, sizeof(*m)); } else { - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + stripe_to_mem(m, new_s); if (!old_s) bch2_stripes_heap_insert(c, m, idx); @@ -816,13 +829,16 @@ err: } /* recovery read path: */ -int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) +int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct ec_stripe_buf *buf; + 
struct ec_stripe_buf *buf = NULL; struct closure cl; struct bch_stripe *v; unsigned i, offset; + const char *msg = NULL; + struct printbuf msgbuf = PRINTBUF; int ret = 0; closure_init_stack(&cl); @@ -835,32 +851,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { - bch_err_ratelimited(c, - "error doing reconstruct read: error %i looking up stripe", ret); - kfree(buf); - return -EIO; + msg = "stripe not found"; + goto err; } v = &bkey_i_to_stripe(&buf->key)->v; if (!bch2_ptr_matches_stripe(v, rbio->pick)) { - bch_err_ratelimited(c, - "error doing reconstruct read: pointer doesn't match stripe"); - ret = -EIO; + msg = "pointer doesn't match stripe"; goto err; } offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { - bch_err_ratelimited(c, - "error doing reconstruct read: read is bigger than stripe"); - ret = -EIO; + msg = "read is bigger than stripe"; goto err; } ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); - if (ret) + if (ret) { + msg = "-ENOMEM"; goto err; + } for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, REQ_OP_READ, i, &cl); @@ -868,9 +880,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { - bch_err_ratelimited(c, - "error doing reconstruct read: unable to read enough blocks"); - ret = -EIO; + msg = "unable to read enough blocks"; goto err; } @@ -882,10 +892,17 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); -err: +out: ec_stripe_buf_exit(buf); kfree(buf); return ret; +err: + bch2_bkey_val_to_text(&msgbuf, c, orig_k); + bch_err_ratelimited(c, + "error doing reconstruct read: %s\n %s", msg, 
msgbuf.buf); + printbuf_exit(&msgbuf);; + ret = -BCH_ERR_stripe_reconstruct; + goto out; } /* stripe bucket accounting: */ @@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bkey_reassemble(n, k); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); @@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } -static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) +static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s = h->s; + lockdep_assert_held(&h->lock); + BUG_ON(!s->allocated && !s->err); h->s = NULL; @@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_new_put(c, s, STRIPE_REF_io); } +static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) +{ + h->s->err = err; + ec_stripe_new_set_pending(c, h); +} + void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) { struct ec_stripe_new *s = ob->ec; @@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i *k, unsigned nr_data, unsigned nr_parity, - unsigned stripe_size) + unsigned stripe_size, + unsigned disk_label) { struct bkey_i_stripe *s = bkey_stripe_init(k); unsigned u64s; @@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); s->v.csum_type = BCH_CSUM_crc32c; - s->v.pad = 0; + s->v.disk_label = disk_label; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { BUG_ON(1 << s->v.csum_granularity_bits >= @@ -1685,40 +1711,32 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->nr_parity = h->redundancy; 
ec_stripe_key_init(c, &s->new_stripe.key, - s->nr_data, s->nr_parity, h->blocksize); + s->nr_data, s->nr_parity, + h->blocksize, h->disk_label); h->s = s; + h->nr_created++; return 0; } -static struct ec_stripe_head * -ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, - unsigned algo, unsigned redundancy, - enum bch_watermark watermark) +static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { - struct ec_stripe_head *h; - - h = kzalloc(sizeof(*h), GFP_KERNEL); - if (!h) - return NULL; - - mutex_init(&h->lock); - BUG_ON(!mutex_trylock(&h->lock)); - - h->target = target; - h->algo = algo; - h->redundancy = redundancy; - h->watermark = watermark; + struct bch_devs_mask devs = h->devs; rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, target); + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? group_to_target(h->disk_label - 1) + : 0); + unsigned nr_devs = dev_mask_nr(&h->devs); for_each_member_device_rcu(c, ca, &h->devs) if (!ca->mi.durability) __clear_bit(ca->dev_idx, h->devs.d); + unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); h->blocksize = pick_blocksize(c, &h->devs); + h->nr_active_devs = 0; for_each_member_device_rcu(c, ca, &h->devs) if (ca->mi.bucket_size == h->blocksize) h->nr_active_devs++; @@ -1729,9 +1747,50 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, * If we only have redundancy + 1 devices, we're better off with just * replication: */ - if (h->nr_active_devs < h->redundancy + 2) - bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", - h->nr_active_devs, h->redundancy + 2); + h->insufficient_devs = h->nr_active_devs < h->redundancy + 2; + + if (h->insufficient_devs) { + const char *err; + + if (nr_devs < h->redundancy + 2) + err = NULL; + else if (nr_devs_with_durability < h->redundancy + 2) + err = "cannot use durability=0 devices"; + else + err = "mismatched bucket sizes"; + + if (err) + bch_err(c, 
"insufficient devices available to create stripe (have %u, need %u): %s", + h->nr_active_devs, h->redundancy + 2, err); + } + + struct bch_devs_mask devs_leaving; + bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX); + + if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving)) + ec_stripe_new_cancel(c, h, -EINTR); + + h->rw_devs_change_count = c->rw_devs_change_count; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label, + unsigned algo, unsigned redundancy, + enum bch_watermark watermark) +{ + struct ec_stripe_head *h; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + BUG_ON(!mutex_trylock(&h->lock)); + + h->disk_label = disk_label; + h->algo = algo; + h->redundancy = redundancy; + h->watermark = watermark; list_add(&h->list, &c->ec_stripe_head_list); return h; @@ -1743,14 +1802,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->nr_data) == h->s->nr_data) - ec_stripe_set_pending(c, h); + ec_stripe_new_set_pending(c, h); mutex_unlock(&h->lock); } static struct ec_stripe_head * __bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + unsigned disk_label, unsigned algo, unsigned redundancy, enum bch_watermark watermark) @@ -1768,27 +1827,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); - goto found; + goto err; } list_for_each_entry(h, &c->ec_stripe_head_list, list) - if (h->target == target && + if (h->disk_label == disk_label && h->algo == algo && h->redundancy == redundancy && h->watermark == watermark) { ret = bch2_trans_mutex_lock(trans, &h->lock); - if (ret) + if (ret) { h = ERR_PTR(ret); + goto err; + } goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); + h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, 
watermark); found: - if (!IS_ERR_OR_NULL(h) && - h->nr_active_devs < h->redundancy + 2) { + if (h->rw_devs_change_count != c->rw_devs_change_count) + ec_stripe_head_devs_update(c, h); + + if (h->insufficient_devs) { mutex_unlock(&h->lock); h = NULL; } +err: mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1878,7 +1942,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ return 0; } -/* XXX: doesn't obey target: */ static s64 get_existing_stripe(struct bch_fs *c, struct ec_stripe_head *head) { @@ -1901,7 +1964,8 @@ static s64 get_existing_stripe(struct bch_fs *c, m = genradix_ptr(&c->stripes, stripe_idx); - if (m->algorithm == head->algo && + if (m->disk_label == head->disk_label && + m->algorithm == head->algo && m->nr_redundant == head->redundancy && m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant && @@ -2046,9 +2110,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, struct bch_fs *c = trans->c; struct ec_stripe_head *h; bool waiting = false; + unsigned disk_label = 0; + struct target t = target_decode(target); int ret; - h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); + if (t.type == TARGET_GROUP) { + if (t.group > U8_MAX) { + bch_err(c, "cannot create a stripe when disk_label > U8_MAX"); + return NULL; + } + disk_label = t.group + 1; /* 0 == no label */ + } + + h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); if (IS_ERR_OR_NULL(h)) return h; @@ -2126,6 +2200,73 @@ err: return ERR_PTR(ret); } +/* device removal */ + +static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); + return -BCH_ERR_invalidate_stripe_to_dev; + } + 
+ struct btree_iter iter; + struct bkey_i_stripe *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + s64 sectors = 0; + for (unsigned i = 0; i < s->v.nr_blocks; i++) + sectors -= stripe_blockcount_get(&s->v, i); + + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + if (ret) + goto err; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == k_a.k->p.inode) + ptr->dev = BCH_SB_MEMBER_INVALID; + + sectors = -sectors; + + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +{ + return bch2_trans_run(c, + for_each_btree_key_upto_commit(trans, iter, + BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), + BTREE_ITER_intent, k, + NULL, NULL, 0, ({ + bch2_invalidate_stripe_to_dev(trans, k); + }))); +} + +/* startup/shutdown */ + static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; @@ -2151,8 +2292,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) } goto unlock; found: - h->s->err = -BCH_ERR_erofs_no_writes; - ec_stripe_set_pending(c, h); + ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); unlock: mutex_unlock(&h->lock); } @@ -2197,17 +2337,9 @@ int bch2_stripes_read(struct bch_fs *c) if (ret) break; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm 
= s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + stripe_to_mem(m, bkey_s_c_to_stripe(k).v); bch2_stripes_heap_insert(c, m, k.k->p.offset); 0; @@ -2252,6 +2384,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) prt_printf(out, " %u", s->blocks[i]); prt_newline(out); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key)); + prt_newline(out); } void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) @@ -2261,9 +2395,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "target %u algo %u redundancy %u %s:\n", - h->target, h->algo, h->redundancy, - bch2_watermarks[h->watermark]); + prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", + h->disk_label, h->algo, h->redundancy, + bch2_watermarks[h->watermark], + h->nr_created); if (h->s) bch2_new_stripe_to_text(out, c, h->s); |