aboutsummaryrefslogtreecommitdiff
path: root/fs/bcachefs/data_update.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs/data_update.c')
-rw-r--r--fs/bcachefs/data_update.c200
1 files changed, 161 insertions, 39 deletions
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 899ff46de8e0..0022b51ce3c0 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -13,6 +13,8 @@
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
+#include "rebalance.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"
@@ -161,11 +163,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
- /*
- * See comment below:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
- */
rewrites_found |= 1U << i;
}
i++;
@@ -211,14 +209,8 @@ restart_drop_extra_replicas:
if (!p.ptr.cached &&
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
- /*
- * Currently, we're dropping unneeded replicas
- * instead of marking them as cached, since
- * cached data in stripe buckets prevents them
- * from being reused:
+
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
- */
goto restart_drop_extra_replicas;
}
}
@@ -248,19 +240,59 @@ restart_drop_extra_replicas:
next_pos = insert->k.p;
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ struct printbuf err = PRINTBUF;
+ int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+ printbuf_exit(&err);
+
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ goto out;
+ }
+
+ if (trace_data_update_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_data_update(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p);
- if (ret)
- goto err;
-
- ret = bch2_trans_update(trans, &iter, insert,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
+ bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
@@ -281,14 +313,14 @@ next:
}
continue;
nowork:
- if (m->ctxt && m->ctxt->stats) {
+ if (m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->ctxt->stats->keys_raced);
+ atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
+ &m->stats->sectors_raced);
}
- this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
+ count_event(c, move_extent_fail);
bch2_btree_iter_advance(&iter);
goto next;
@@ -323,7 +355,6 @@ void bch2_data_update_exit(struct data_update *update)
struct bch_fs *c = update->op.c;
struct bkey_ptrs_c ptrs =
bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr) {
if (c->opts.nocow_enabled)
@@ -337,14 +368,13 @@ void bch2_data_update_exit(struct data_update *update)
bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
}
-void bch2_update_unwritten_extent(struct btree_trans *trans,
+static void bch2_update_unwritten_extent(struct btree_trans *trans,
struct data_update *update)
{
struct bch_fs *c = update->op.c;
struct bio *bio = &update->op.wbio.bio;
struct bkey_i_extent *e;
struct write_point *wp;
- struct bch_extent_ptr *ptr;
struct closure cl;
struct btree_iter iter;
struct bkey_s_c k;
@@ -385,6 +415,8 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
continue;
}
+ bch_err_fn_ratelimited(c, ret);
+
if (ret)
return;
@@ -417,7 +449,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
}
}
+int bch2_extent_drop_ptrs(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct data_update_opts data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i *n;
+ int ret;
+
+ n = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ while (data_opts.kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+ struct bch_extent_ptr *ptr;
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ data_opts.kill_ptrs ^= 1U << drop;
+ }
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, bkey_i_to_s(n));
+
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ n->k.size = 0;
+
+ return bch2_trans_relock(trans) ?:
+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
+
int bch2_data_update_init(struct btree_trans *trans,
+ struct btree_iter *iter,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
@@ -430,15 +506,24 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- const struct bch_extent_ptr *ptr;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
unsigned ptrs_locked = 0;
- int ret;
+ int ret = 0;
+
+ /*
+ * fs is corrupt we have a key for a snapshot node that doesn't exist,
+ * and we have to check for this because we go rw before repairing the
+ * snapshots table - just skip it, we can move it later.
+ */
+ if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+ return -BCH_ERR_data_update_done;
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
@@ -451,12 +536,14 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr)
percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref);
+ unsigned durability_have = 0, durability_removing = 0;
+
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
bool locked;
@@ -468,8 +555,11 @@ int bch2_data_update_init(struct btree_trans *trans,
reserve_sectors += k.k->size;
m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
- } else if (!p.ptr.cached) {
+ durability_removing += bch2_extent_ptr_desired_durability(c, &p);
+ } else if (!p.ptr.cached &&
+ !((1U << i) & m->data_opts.kill_ptrs)) {
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
+ durability_have += bch2_extent_ptr_durability(c, &p);
}
/*
@@ -487,10 +577,10 @@ int bch2_data_update_init(struct btree_trans *trans,
if (c->opts.nocow_enabled) {
if (ctxt) {
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
- !atomic_read(&ctxt->read_sectors));
+ list_empty(&ctxt->ios));
if (!locked)
bch2_bucket_nocow_lock(&c->nocow_locks,
@@ -508,6 +598,39 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
+ /*
+ * If current extent durability is less than io_opts.data_replicas,
+ * we're not trying to rereplicate the extent up to data_replicas here -
+ * unless extra_replicas was specified
+ *
+ * Increasing replication is an explicit operation triggered by
+ * rereplicate, currently, so that users don't get an unexpected -ENOSPC
+ */
+ if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
+ !durability_required) {
+ m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
+ m->data_opts.rewrite_ptrs = 0;
+ /* if iter == NULL, it's just a promote */
+ if (iter)
+ ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+ goto done;
+ }
+
+ m->op.nr_replicas = min(durability_removing, durability_required) +
+ m->data_opts.extra_replicas;
+
+ /*
+ * If device(s) were set to durability=0 after data was written to them
+ * we can end up with a duribilty=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
+
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas
@@ -517,14 +640,11 @@ int bch2_data_update_init(struct btree_trans *trans,
goto err;
}
- m->op.nr_replicas += m->data_opts.extra_replicas;
- m->op.nr_replicas_required = m->op.nr_replicas;
-
- BUG_ON(!m->op.nr_replicas);
+ if (bkey_extent_is_unwritten(k)) {
+ bch2_update_unwritten_extent(trans, m);
+ goto done;
+ }
- /* Special handling required: */
- if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_unwritten_extent_update;
return 0;
err:
i = 0;
@@ -539,12 +659,14 @@ err:
bch2_bkey_buf_exit(&m->k, c);
bch2_bio_free_pages_pool(c, &m->op.wbio.bio);
return ret;
+done:
+ bch2_data_update_exit(m);
+ return ret ?: -BCH_ERR_data_update_done;
}
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {