diff options
Diffstat (limited to 'fs/bcachefs')
82 files changed, 1797 insertions, 1112 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index c08c2c7d6fbb..fddc7be58022 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -33,6 +33,18 @@ config BCACHEFS_QUOTA depends on BCACHEFS_FS select QUOTACTL +config BCACHEFS_ERASURE_CODING + bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)" + depends on BCACHEFS_FS + select QUOTACTL + help + This enables the "erasure_code" filesysystem and inode option, which + organizes data into reed-solomon stripes instead of ordinary + replication. + + WARNING: this feature is still undergoing on disk format changes, and + should only be enabled for testing purposes. + config BCACHEFS_POSIX_ACL bool "bcachefs POSIX ACL support" depends on BCACHEFS_FS diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 45b64f89258c..b81268418174 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -28,6 +28,7 @@ bcachefs-y := \ clock.o \ compress.o \ counters.o \ + darray.o \ debug.o \ dirent.o \ disk_groups.o \ @@ -70,6 +71,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ sb-clean.o \ + sb-downgrade.o \ sb-errors.o \ sb-members.o \ siphash.o \ diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index f3809897f00a..3640f417cce1 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -366,7 +366,8 @@ retry: bch2_trans_begin(trans); acl = _acl; - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), + ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: + bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b85c7765272f..0e6157982607 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1297,6 +1297,30 @@ out: return wp; } +static noinline void +deallocate_extra_replicas(struct bch_fs *c, + struct open_buckets *ptrs, + struct open_buckets *ptrs_no_use, + unsigned extra_replicas) +{ + struct open_buckets ptrs2 = { 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, ptrs, ob, i) { + unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + + if (d && d <= extra_replicas) { + extra_replicas -= d; + ob_push(c, ptrs_no_use, ob); + } else { + ob_push(c, &ptrs2, ob); + } + } + + *ptrs = ptrs2; +} + /* * Get us an open_bucket we can allocate from, return with it locked: */ @@ -1321,6 +1345,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, int ret; int i; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) + erasure_code = false; + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); BUG_ON(!nr_replicas || !nr_replicas_required); @@ -1347,8 +1374,17 @@ retry: goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - goto allocate_blocking; + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { + int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + goto alloc_done; + } /* * Only try to allocate cache (durability = 0 devices) from the @@ -1362,7 +1398,6 @@ retry: &have_cache, watermark, flags, cl); } else { -allocate_blocking: ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, @@ -1382,6 +1417,9 @@ alloc_done: if (ret) goto err; + if (nr_effective > nr_replicas) + deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) open_bucket_free_unused(c, ob); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ef02c9bb0354..23c0834a97a4 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, bp.level - 1, 0); b = bch2_btree_iter_peek_node(iter); - if (IS_ERR(b)) + if (IS_ERR_OR_NULL(b)) goto err; BUG_ON(b->c.level != bp.level - 1); - if (b && extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + if (extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) return b; - if (b && btree_node_will_make_reachable(b)) { + if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9cb8684959ee..b62737fdf5ab 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -617,7 +617,7 @@ struct journal_seq_blacklist_table { u64 start; u64 end; bool dirty; - } entries[0]; + } entries[]; }; struct journal_keys { @@ -638,6 +638,8 @@ struct journal_keys { size_t gap; size_t nr; size_t size; + atomic_t ref; + bool initial_ref_held; }; struct btree_trans_buf { @@ -735,6 +737,7 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; + unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; } sb; @@ -929,7 +932,7 @@ struct bch_fs { mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; mempool_t decompress_workspace; - ZSTD_parameters zstd_params; + size_t zstd_workspace_size; struct crypto_shash *sha256; struct crypto_sync_skcipher *chacha20; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0a750953ff92..fe78e87603fc 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -151,7 +151,11 @@ struct bpos { #else #error edit for your odd byteorder. #endif -} __packed __aligned(4); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; #define KEY_INODE_MAX ((__u64)~0ULL) #define KEY_OFFSET_MAX ((__u64)~0ULL) @@ -1203,19 +1207,21 @@ struct bch_sb_field { }; #define BCH_SB_FIELDS() \ - x(journal, 0) \ - x(members_v1, 1) \ - x(crypt, 2) \ - x(replicas_v0, 3) \ - x(quota, 4) \ - x(disk_groups, 5) \ - x(clean, 6) \ - x(replicas, 7) \ - x(journal_seq_blacklist, 8) \ - x(journal_v2, 9) \ - x(counters, 10) \ - x(members_v2, 11) \ - x(errors, 12) + x(journal, 0) \ + x(members_v1, 1) \ + x(crypt, 2) \ + x(replicas_v0, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) \ + x(counters, 10) \ + x(members_v2, 11) \ + x(errors, 12) \ + x(ext, 13) \ + x(downgrade, 14) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1528,7 +1534,7 @@ struct bch_sb_field_disk_groups { x(move_extent_write, 36) \ x(move_extent_finish, 37) \ x(move_extent_fail, 38) \ - x(move_extent_alloc_mem_fail, 39) \ + x(move_extent_start_fail, 39) \ x(copygc, 40) \ x(copygc_wait, 41) \ x(gc_gens_end, 42) \ @@ -1627,6 +1633,24 @@ struct bch_sb_field_errors { LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); +struct bch_sb_field_ext { + struct bch_sb_field field; + __le64 recovery_passes_required[2]; + __le64 errors_silent[8]; +}; + +struct bch_sb_field_downgrade_entry { + __le16 version; + __le64 recovery_passes[2]; + __le16 nr_errors; + __le16 errors[] __counted_by(nr_errors); +} __packed __aligned(2); + +struct bch_sb_field_downgrade { + struct bch_sb_field field; + struct bch_sb_field_downgrade_entry entries[]; +}; + /* Superblock: */ /* @@ -1640,6 +1664,11 @@ LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); #define RECOVERY_PASS_ALL_FSCK (1ULL << 63) +/* + * field 1: version name + * field 2: BCH_VERSION(major, minor) + * field 3: recovery passess required on upgrade + */ #define BCH_METADATA_VERSIONS() \ x(bkey_renumber, BCH_VERSION(0, 10), \ RECOVERY_PASS_ALL_FSCK) \ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 47e7770d0583..79495cd7a794 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -9,6 +9,7 @@ #include "debug.h" #include "errcode.h" #include "error.h" +#include "journal.h" #include "trace.h" #include <linux/prefetch.h> @@ -424,14 +425,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - if (btree_node_dirty(b)) - bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty_acct(c, b); - btree_node_data_free(c, b); } - BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(!bch2_journal_error(&c->journal) && + atomic_read(&c->btree_cache.dirty)); list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0b5d09c8475d..30ab78a24517 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1541,8 +1541,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); @@ -1561,8 +1561,9 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) g->stripe = a->stripe; g->stripe_redundancy = a->stripe_redundancy; } - } - bch2_trans_iter_exit(trans, &iter); + + 0; + })); err: bch2_trans_put(trans); if (ret) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 37d896edb06e..5a720f0cd5a6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void * return offset; } -static void btree_node_read_all_replicas_done(struct closure *cl) +static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) { - struct btree_node_read_all *ra = - container_of(cl, struct btree_node_read_all, cl); + closure_type(ra, struct btree_node_read_all, cl); struct bch_fs *c = ra->c; struct btree *b = ra->b; struct printbuf buf = PRINTBUF; @@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool if (sync) { closure_sync(&ra->cl); - btree_node_read_all_replicas_done(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl.work); } else { continue_at(&ra->cl, btree_node_read_all_replicas_done, c->io_complete_wq); @@ -1705,8 +1704,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } -void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, - struct btree_write *w) +static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + struct btree_write *w) { unsigned long old, new, v = READ_ONCE(b->will_make_reachable); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 7e03dd76fb38..e0d7fa5b1dfb 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -134,9 +134,6 @@ void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); -void bch2_btree_complete_write(struct bch_fs *, struct btree *, - struct btree_write *); - bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c2adf3fbb0b3..da594e006769 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2085,18 +2085,16 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e goto out_no_locked; /* - * iter->pos should be mononotically increasing, and always be - * equal to the key we just returned - except extents can - * straddle iter->pos: + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode that requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + * + * But we can't do the full check here, because bkey_start_pos() + * isn't monotonically increasing before FILTER_SNAPSHOTS, and + * that's what we check against in extents mode: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) - iter_pos = k.k->p; - else - iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - - if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_gt(iter_pos, end) - : bkey_ge(iter_pos, end))) + if (k.k->p.inode > end.inode) goto end; if (iter->update_path && @@ -2155,6 +2153,21 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e continue; } + /* + * iter->pos should be mononotically increasing, and always be + * equal to the key we just returned - except extents can + * straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter_pos = k.k->p; + else + iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); + + if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_gt(iter_pos, end) + : bkey_ge(iter_pos, end))) + goto end; + break; } @@ -2981,7 +2994,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->fn_idx = fn_idx; trans->locking_wait.task = current; trans->journal_replay_not_finished = - !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + atomic_inc_not_zero(&c->journal_keys.ref); closure_init_stack(&trans->ref); s = btree_trans_stats(trans); @@ -3087,8 +3101,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - kfree(trans->extra_journal_entries.data); if (trans->fs_usage_deltas) { @@ -3100,6 +3112,9 @@ void bch2_trans_put(struct btree_trans *trans) kfree(trans->fs_usage_deltas); } + if (unlikely(trans->journal_replay_not_finished)) + bch2_journal_keys_put(c); + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); else @@ -3212,10 +3227,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) mempool_exit(&c->btree_trans_pool); } -int bch2_fs_btree_iter_init(struct bch_fs *c) +void bch2_fs_btree_iter_init_early(struct bch_fs *c) { struct btree_transaction_stats *s; - int ret; for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); @@ -3226,6 +3240,11 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); seqmutex_init(&c->btree_trans_lock); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + int ret; c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); if (!c->btree_trans_bufs) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 85e7cb52f6b6..eaffced4c132 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -938,6 +938,7 @@ unsigned bch2_trans_get_fn_idx(const char *); void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); +void bch2_fs_btree_iter_init_early(struct bch_fs *); int bch2_fs_btree_iter_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 58a981bcf3aa..ec52f50d249d 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -80,6 +80,8 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree struct journal_keys *keys = &c->journal_keys; unsigned iters = 0; struct journal_key *k; + + BUG_ON(*idx > keys->nr); search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); @@ -189,10 +191,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, /* Since @keys was full, there was no gap: */ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); kvfree(keys->d); - *keys = new_keys; + keys->d = new_keys.d; + keys->nr = new_keys.nr; + keys->size = new_keys.size; /* And now the gap is at the end: */ - keys->gap = keys->nr; + keys->gap = keys->nr; } journal_iters_move_gap(c, keys->gap, idx); @@ -415,10 +419,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) cmp_int(l->journal_offset, r->journal_offset); } -void bch2_journal_keys_free(struct journal_keys *keys) +void bch2_journal_keys_put(struct bch_fs *c) { + struct journal_keys *keys = &c->journal_keys; struct journal_key *i; + BUG_ON(atomic_read(&keys->ref) <= 0); + + if (!atomic_dec_and_test(&keys->ref)) + return; + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; @@ -429,6 +439,8 @@ void bch2_journal_keys_free(struct journal_keys *keys) kvfree(keys->d); keys->d = NULL; keys->nr = keys->gap = keys->size = 0; + + bch2_journal_entries_free(c); } static void __journal_keys_sort(struct journal_keys *keys) diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 5d64e7e22f26..8ca4c100b2e3 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_keys_put(struct bch_fs *); + +static inline void bch2_journal_keys_put_initial(struct bch_fs *c) +{ + if (c->journal_keys.initial_ref_held) + bch2_journal_keys_put(c); + c->journal_keys.initial_ref_held = false; +} + void bch2_journal_entries_free(struct bch_fs *); int bch2_journal_keys_sort(struct bch_fs *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9b78f78a75b5..1b7a5668df7c 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - if (ck->c.lock.readers) + if (ck->c.lock.readers) { list_move_tail(&ck->list, &bc->freed_pcpu); - else + bc->nr_freed_pcpu++; + } else { list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + } atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, { struct bkey_cached *pos; + bc->nr_freed_nonpcpu++; + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, pos->btree_trans_barrier_seq)) { @@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, #else mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; mutex_unlock(&bc->lock); #endif } else { @@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, f->nr < ARRAY_SIZE(f->objs) / 2) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; f->objs[f->nr++] = ck; } @@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_nonpcpu)) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; } mutex_unlock(&bc->lock); #endif @@ -664,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; bch2_journal_pin_drop(j, &ck->journal); - bch2_journal_preres_put(j, &ck->res); BUG_ON(!btree_node_locked(c_iter.path, 0)); @@ -762,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - int difference; - - BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - - difference = jset_u64s(insert->k.u64s) - ck->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - ck->res.u64s += difference; - } - } - bkey_copy(ck->k, insert); ck->valid = true; @@ -850,6 +845,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ + scanned += bc->nr_freed_nonpcpu; + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -859,13 +856,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_nonpcpu--; } if (scanned >= nr) goto out; + scanned += bc->nr_freed_pcpu; + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -875,8 +874,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_pcpu--; } if (scanned >= nr) @@ -982,6 +981,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } #endif + BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); + BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); list_splice(&bc->freed_nonpcpu, &items); @@ -990,9 +992,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_for_each_entry_safe(ck, n, &items, list) { cond_resched(); - bch2_journal_pin_drop(&c->journal, &ck->journal); - bch2_journal_preres_put(&c->journal, &ck->res); - list_del(&ck->list); kfree(ck->k); six_lock_exit(&ck->c.lock); diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h new file mode 100644 index 000000000000..290e4e57df5b --- /dev/null +++ b/fs/bcachefs/btree_key_cache_types.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + + struct list_head freed_pcpu; + size_t nr_freed_pcpu; + struct list_head freed_nonpcpu; + size_t nr_freed_nonpcpu; + + struct shrinker *shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index decad7b66c59..12907beda98c 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, bch2_btree_init_next(trans, b); } +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int bch2_trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + EBUG_ON(trans->write_locked); + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + trans->write_locked = true; + return 0; +} + +static inline void bch2_trans_unlock_write(struct btree_trans *trans) +{ + if (likely(trans->write_locked)) { + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + trans->write_locked = false; + } +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -276,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); } -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { @@ -321,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans, return 0; } +noinline static int +btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned new_u64s) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_i *new_k; + int ret; + + bch2_trans_unlock_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + ret = bch2_trans_relock(trans) ?: + bch2_trans_lock_write(trans); + if (unlikely(ret)) { + kfree(new_k); + return ret; + } + + memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned u64s) { @@ -347,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + if (unlikely(!new_k)) + return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); trans_for_each_update(trans, i) if (i->old_v == &ck->k->v) @@ -732,37 +804,6 @@ revert_fs_usage: return ret; } -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -830,15 +871,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags } } - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - - ret = trans_lock_write(trans); + ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -847,10 +880,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_trans_unlock_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, @@ -1003,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; struct btree_write_buffered_key *wb; - unsigned u64s; int ret = 0; if (!trans->nr_updates && @@ -1063,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1085,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - if (i->flags & BTREE_UPDATE_NOJOURNAL) continue; - trans->journal_u64s += u64s; + /* we're going to journal the key being updated: */ + trans->journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) @@ -1126,8 +1145,6 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 941841a0c5bf..60453ba86c4b 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,7 +5,7 @@ #include <linux/list.h> #include <linux/rhashtable.h> -//#include "bkey_methods.h" +#include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" #include "errcode.h" @@ -312,31 +312,6 @@ struct btree_iter { #endif }; -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; - -struct btree_key_cache { - struct mutex lock; - struct rhashtable table; - bool table_init_done; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct shrinker *shrink; - unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - - atomic_long_t nr_freed; - atomic_long_t nr_keys; - atomic_long_t nr_dirty; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 @@ -352,7 +327,6 @@ struct bkey_cached { struct rhash_head hash; struct list_head list; - struct journal_preres res; struct journal_entry_pin journal; u64 seq; @@ -389,11 +363,7 @@ struct btree_insert_entry { unsigned long ip_allocated; }; -#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 -#else -#define BTREE_ITER_MAX 32 -#endif struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); @@ -434,6 +404,7 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; + bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; unsigned long last_begin_ip; @@ -465,11 +436,9 @@ struct btree_trans { struct journal_entry_pin *journal_pin; struct journal_res journal_res; - struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; - unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; }; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 324767c0ddcc..2fd3c8cc6f51 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -186,8 +186,11 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, enum btree_id btree_id = iter->btree_id; struct bkey_i *update; struct bpos new_start = bkey_start_pos(new.k); - bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); - bool back_split = bkey_gt(old.k->p, new.k->p); + unsigned front_split = bkey_lt(bkey_start_pos(old.k), new_start); + unsigned back_split = bkey_gt(old.k->p, new.k->p); + unsigned middle_split = (front_split || back_split) && + old.k->p.snapshot != new.k->p.snapshot; + unsigned nr_splits = front_split + back_split + middle_split; int ret = 0, compressed_sectors; /* @@ -195,10 +198,9 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, * so that __bch2_trans_commit() can increase our disk * reservation: */ - if (((front_split && back_split) || - ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && + if (nr_splits > 1 && (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_journal_res += compressed_sectors; + trans->extra_journal_res += compressed_sectors * (nr_splits - 1); if (front_split) { update = bch2_bkey_make_mut_noupdate(trans, old); @@ -216,8 +218,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, } /* If we're overwriting in a different snapshot - middle split: */ - if (old.k->p.snapshot != new.k->p.snapshot && - (front_split || back_split)) { + if (middle_split) { update = bch2_bkey_make_mut_noupdate(trans, old); if ((ret = PTR_ERR_OR_ZERO(update))) return ret; @@ -554,6 +555,19 @@ int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, BTREE_UPDATE_PREJOURNAL); } +static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_copy(n, k); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k) @@ -564,6 +578,9 @@ int __must_check bch2_trans_update_buffered(struct btree_trans *trans, EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + if (unlikely(trans->journal_replay_not_finished)) + return bch2_btree_insert_clone_trans(trans, btree, k); + trans_for_each_wb_update(trans, i) { if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { bkey_copy(&i->k, k); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 39c2db68123b..239fcc3c7c99 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -99,7 +99,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) /* Calculate ideal packed bkey format for new btree nodes: */ -void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) +static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) { struct bkey_packed *k; struct bset_tree *t; @@ -125,21 +125,20 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b) return bch2_bkey_format_done(&s); } -static size_t btree_node_u64s_with_format(struct btree *b, +static size_t btree_node_u64s_with_format(struct btree_nr_keys nr, + struct bkey_format *old_f, struct bkey_format *new_f) { - struct bkey_format *old_f = &b->format; - /* stupid integer promotion rules */ ssize_t delta = (((int) new_f->key_u64s - old_f->key_u64s) * - (int) b->nr.packed_keys) + + (int) nr.packed_keys) + (((int) new_f->key_u64s - BKEY_U64s) * - (int) b->nr.unpacked_keys); + (int) nr.unpacked_keys); - BUG_ON(delta + b->nr.live_u64s < 0); + BUG_ON(delta + nr.live_u64s < 0); - return b->nr.live_u64s + delta; + return nr.live_u64s + delta; } /** @@ -147,16 +146,18 @@ static size_t btree_node_u64s_with_format(struct btree *b, * * @c: filesystem handle * @b: btree node to rewrite + * @nr: number of keys for new node (i.e. b->nr) * @new_f: bkey format to translate keys to * * Returns: true if all re-packed keys will be able to fit in a new node. * * Assumes all keys will successfully pack with the new format. */ -bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, +static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, + struct btree_nr_keys nr, struct bkey_format *new_f) { - size_t u64s = btree_node_u64s_with_format(b, new_f); + size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); } @@ -391,7 +392,7 @@ static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, * The keys might expand with the new format - if they wouldn't fit in * the btree node anymore, use the old format for now: */ - if (!bch2_btree_node_format_fits(as->c, b, &format)) + if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format)) format = b->format; SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); @@ -513,8 +514,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * up_read(&c->gc_lock); as->took_gc_lock = false; - bch2_journal_preres_put(&c->journal, &as->journal_preres); - bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); @@ -734,8 +733,6 @@ err: bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_preres_put(&c->journal, &as->journal_preres); - mutex_lock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; @@ -782,9 +779,9 @@ static void btree_interior_update_work(struct work_struct *work) } } -static void btree_update_set_nodes_written(struct closure *cl) +static CLOSURE_CALLBACK(btree_update_set_nodes_written) { - struct btree_update *as = container_of(cl, struct btree_update, cl); + closure_type(as, struct btree_update, cl); struct bch_fs *c = as->c; mutex_lock(&c->btree_interior_update_lock); @@ -1047,7 +1044,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; @@ -1061,9 +1057,16 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - journal_flags |= watermark; + if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) && + watermark < c->journal.watermark) { + struct journal_res res = { 0 }; + + ret = drop_locks_do(trans, + bch2_journal_res_get(&c->journal, &res, 1, + watermark|JOURNAL_RES_GET_CHECK)); + if (ret) + return ERR_PTR(ret); + } while (1) { nr_nodes[!!update_level] += 1 + split; @@ -1080,8 +1083,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, break; } + /* + * Always check for space for two keys, even if we won't have to + * split at prior level - it might have been a merge instead: + */ if (bch2_btree_node_insert_fits(c, path->l[update_level].b, - BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + BKEY_BTREE_PTR_U64s_MAX * 2)) break; split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); @@ -1129,27 +1136,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret) { - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - goto err; - } - - ret = drop_locks_do(trans, - bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags)); - if (ret == -BCH_ERR_journal_preres_get_blocked) { - trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - } - if (ret) - goto err; - } - ret = bch2_disk_reservation_get(c, &as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, @@ -1360,8 +1346,11 @@ static void __btree_split_node(struct btree_update *as, struct bkey_packed *out[2]; struct bkey uk; unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; + struct { unsigned nr_keys, val_u64s; } nr_keys[2]; int i; + memset(&nr_keys, 0, sizeof(nr_keys)); + for (i = 0; i < 2; i++) { BUG_ON(n[i]->nsets != 1); @@ -1383,6 +1372,9 @@ static void __btree_split_node(struct btree_update *as, if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); + + nr_keys[i].nr_keys++; + nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k); } btree_set_min(n[0], b->data->min_key); @@ -1395,6 +1387,12 @@ static void __btree_split_node(struct btree_update *as, bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); n[i]->data->format = bch2_bkey_format_done(&format[i]); + + unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + + nr_keys[i].val_u64s; + if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c)) + n[i]->data->format = b->format; + btree_node_set_format(n[i], n[i]->data->format); } @@ -1837,8 +1835,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_bkey_format_add_pos(&new_s, next->data->max_key); new_f = bch2_bkey_format_done(&new_s); - sib_u64s = btree_node_u64s_with_format(b, &new_f) + - btree_node_u64s_with_format(m, &new_f); + sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) + + btree_node_u64s_with_format(m->nr, &m->format, &new_f); if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); @@ -2296,6 +2294,10 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); + struct bch_extent_ptr *ptr; + bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, + !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); out: diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4df21512d640..a6668992a272 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -6,10 +6,6 @@ #include "btree_locking.h" #include "btree_update.h" -void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); -bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, - struct bkey_format *); - #define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) #define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) @@ -55,7 +51,6 @@ struct btree_update { unsigned update_level; struct disk_reservation disk_res; - struct journal_preres journal_preres; /* * BTREE_INTERIOR_UPDATING_NODE: diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 58d8c6ffd955..5a91d3189fcf 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -854,8 +854,12 @@ static int __mark_pointer(struct btree_trans *trans, return ret; *dst_sectors += sectors; - *bucket_data_type = *dirty_sectors || *cached_sectors - ? ptr_data_type : 0; + + if (!*dirty_sectors && !*cached_sectors) + *bucket_data_type = 0; + else if (*bucket_data_type != BCH_DATA_stripe) + *bucket_data_type = ptr_data_type; + return 0; } @@ -2091,8 +2095,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; - bch2_copygc_stop(c); - if (resize) { down_write(&c->gc_lock); down_write(&ca->bucket_lock); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index a8b148ec2a2b..51af8ea230ed 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -354,8 +354,7 @@ static int attempt_compress(struct bch_fs *c, */ unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); - ZSTD_CCtx *ctx = zstd_init_cctx(workspace, - zstd_cctx_workspace_bound(¶ms.cParams)); + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size); /* * ZSTD requires that when we decompress we pass in the exact @@ -371,7 +370,7 @@ static int attempt_compress(struct bch_fs *c, size_t len = zstd_compress_cctx(ctx, dst + 4, dst_len - 4 - 7, src, src_len, - &c->zstd_params); + ¶ms); if (zstd_is_error(len)) return 0; @@ -572,6 +571,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) size_t decompress_workspace_size = 0; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); + + /* + * ZSTD is lying: if we allocate the size of the workspace it says it + * requires, it returns memory allocation errors + */ + c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); + struct { unsigned feature; enum bch_compression_type type; @@ -585,13 +591,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, - zstd_cctx_workspace_bound(¶ms.cParams), + c->zstd_workspace_size, zstd_dctx_workspace_bound() }, }, *i; bool have_compressed = false; - c->zstd_params = params; - for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c new file mode 100644 index 000000000000..ac35b8b705ae --- /dev/null +++ b/fs/bcachefs/darray.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/log2.h> +#include <linux/slab.h> +#include "darray.h" + +int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +{ + if (new_size > d->size) { + new_size = roundup_pow_of_two(new_size); + + void *data = kvmalloc_array(new_size, element_size, gfp); + if (!data) + return -ENOMEM; + + memcpy(data, d->data, d->size * element_size); + if (d->data != d->preallocated) + kvfree(d->data); + d->data = data; + d->size = new_size; + } + + return 0; +} diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 87b4b2d1ec76..e367c625f057 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -8,39 +8,48 @@ * Inspired by CCAN's darray */ -#include "util.h" #include <linux/slab.h> -#define DARRAY(type) \ +#define DARRAY_PREALLOCATED(_type, _nr) \ struct { \ size_t nr, size; \ - type *data; \ + _type *data; \ + _type preallocated[_nr]; \ } -typedef DARRAY(void) darray_void; +#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) -static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) +typedef DARRAY(char) darray_char; + +int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); + +static inline int __darray_resize(darray_char *d, size_t element_size, + size_t new_size, gfp_t gfp) { - if (d->nr + more > d->size) { - size_t new_size = roundup_pow_of_two(d->nr + more); - void *data = krealloc_array(d->data, new_size, t_size, gfp); + return unlikely(new_size > d->size) + ? __bch2_darray_resize(d, element_size, new_size, gfp) + : 0; +} - if (!data) - return -ENOMEM; +#define darray_resize_gfp(_d, _new_size, _gfp) \ + unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)) - d->data = data; - d->size = new_size; - } +#define darray_resize(_d, _new_size) \ + darray_resize_gfp(_d, _new_size, GFP_KERNEL) - return 0; +static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp) +{ + return __darray_resize(d, t_size, d->nr + more, gfp); } #define darray_make_room_gfp(_d, _more, _gfp) \ - __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp) #define darray_make_room(_d, _more) \ darray_make_room_gfp(_d, _more, GFP_KERNEL) +#define darray_room(_d) ((_d).size - (_d).nr) + #define darray_top(_d) ((_d).data[(_d).nr]) #define darray_push_gfp(_d, _item, _gfp) \ @@ -80,13 +89,16 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, #define darray_init(_d) \ do { \ - (_d)->data = NULL; \ - (_d)->nr = (_d)->size = 0; \ + (_d)->nr = 0; \ + (_d)->size = ARRAY_SIZE((_d)->preallocated); \ + (_d)->data = (_d)->size ? (_d)->preallocated : NULL; \ } while (0) #define darray_exit(_d) \ do { \ - kfree((_d)->data); \ + if (!ARRAY_SIZE((_d)->preallocated) || \ + (_d)->data != (_d)->preallocated) \ + kvfree((_d)->data); \ darray_init(_d); \ } while (0) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0771a6d880bf..37d6ecae8c30 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -239,6 +239,34 @@ restart_drop_extra_replicas: next_pos = insert->k.p; + /* + * Check for nonce offset inconsistency: + * This is debug code - we've been seeing this bug rarely, and + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ + struct printbuf err = PRINTBUF; + int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); + printbuf_exit(&err); + + if (invalid) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + goto out; + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, @@ -328,7 +356,7 @@ void bch2_data_update_exit(struct data_update *update) bch2_bio_free_pages_pool(c, &update->op.wbio.bio); } -void bch2_update_unwritten_extent(struct btree_trans *trans, +static void bch2_update_unwritten_extent(struct btree_trans *trans, struct data_update *update) { struct bch_fs *c = update->op.c; @@ -408,7 +436,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, } } +int bch2_extent_drop_ptrs(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct data_update_opts data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + n = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + while (data_opts.kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + data_opts.kill_ptrs ^= 1U << drop; + } + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + n->k.size = 0; + + return bch2_trans_relock(trans) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + int bch2_data_update_init(struct btree_trans *trans, + struct btree_iter *iter, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, @@ -424,7 +496,7 @@ int bch2_data_update_init(struct btree_trans *trans, const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned ptrs_locked = 0; - int ret; + int ret = 0; bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -450,6 +522,8 @@ int bch2_data_update_init(struct btree_trans *trans, bkey_for_each_ptr(ptrs, ptr) percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + unsigned durability_have = 0, durability_removing = 0; + i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bool locked; @@ -461,8 +535,11 @@ int bch2_data_update_init(struct btree_trans *trans, reserve_sectors += k.k->size; m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - } else if (!p.ptr.cached) { + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!p.ptr.cached && + !((1U << i) & m->data_opts.kill_ptrs)) { bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); } /* @@ -483,7 +560,8 @@ int bch2_data_update_init(struct btree_trans *trans, move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || - !atomic_read(&ctxt->read_sectors)); + (!atomic_read(&ctxt->read_sectors) && + !atomic_read(&ctxt->write_sectors))); if (!locked) bch2_bucket_nocow_lock(&c->nocow_locks, @@ -501,6 +579,30 @@ int bch2_data_update_init(struct btree_trans *trans, i++; } + /* + * If current extent durability is less than io_opts.data_replicas, + * we're not trying to rereplicate the extent up to data_replicas here - + * unless extra_replicas was specified + * + * Increasing replication is an explicit operation triggered by + * rereplicate, currently, so that users don't get an unexpected -ENOSPC + */ + if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && + durability_have >= io_opts.data_replicas) { + m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); + goto done; + } + + m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + + m->data_opts.extra_replicas; + m->op.nr_replicas_required = m->op.nr_replicas; + + BUG_ON(!m->op.nr_replicas); + if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas @@ -510,14 +612,11 @@ int bch2_data_update_init(struct btree_trans *trans, goto err; } - m->op.nr_replicas += m->data_opts.extra_replicas; - m->op.nr_replicas_required = m->op.nr_replicas; - - BUG_ON(!m->op.nr_replicas); + if (bkey_extent_is_unwritten(k)) { + bch2_update_unwritten_extent(trans, m); + goto done; + } - /* Special handling required: */ - if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_unwritten_extent_update; return 0; err: i = 0; @@ -532,6 +631,9 @@ err: bch2_bkey_buf_exit(&m->k, c); bch2_bio_free_pages_pool(c, &m->op.wbio.bio); return ret; +done: + bch2_data_update_exit(m); + return ret ?: -BCH_ERR_data_update_done; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 9dc17b9d8379..991095bbd469 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -32,9 +32,14 @@ int bch2_data_update_index_update(struct bch_write_op *); void bch2_data_update_read_done(struct data_update *, struct bch_extent_crc_unpacked); +int bch2_extent_drop_ptrs(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c, + struct data_update_opts); + void bch2_data_update_exit(struct data_update *); -void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); -int bch2_data_update_init(struct btree_trans *, struct moving_context *, +int bch2_data_update_init(struct btree_trans *, struct btree_iter *, + struct moving_context *, struct data_update *, struct write_point_specifier, struct bch_io_opts, struct data_update_opts, diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 1a0f2d571569..2bfff0da7000 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -485,20 +485,15 @@ retry: return ret; } -int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir.inum, 0, snapshot), - POS(dir.inum, U64_MAX), 0, k, ret) + SPOS(dir, 0, snapshot), + POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { ret = -ENOTEMPTY; break; @@ -508,6 +503,14 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) return ret; } +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) +{ + u32 snapshot; + + return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: + bch2_empty_dir_snapshot(trans, dir.inum, snapshot); +} + int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans *trans = bch2_trans_get(c); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index cd262bf4d9c5..1e3431990abd 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -64,6 +64,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *); +int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index d613695abf9f..4d0cb0ccff32 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) case TARGET_DEV: { struct bch_dev *ca; + out->atomic++; rcu_read_lock(); ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) @@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } rcu_read_unlock(); + out->atomic--; break; } case TARGET_GROUP: @@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } } -void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct target t = target_decode(v); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 875f7c5a6fca..2a77de18c004 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + if (h->nr_active_devs < h->redundancy + 2) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", + h->nr_active_devs, h->redundancy + 2); + list_add(&h->list, &c->ec_stripe_head_list); return h; } @@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: + if (!IS_ERR_OR_NULL(h) && + h->nr_active_devs < h->redundancy + 2) { + mutex_unlock(&h->lock); + h = NULL; + } mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); - if (!h) - bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) return h; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 68a1a96bb7ca..9ce29681eec9 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -95,6 +95,7 @@ x(ENOSPC, ENOSPC_sb_members) \ x(ENOSPC, ENOSPC_sb_members_v2) \ x(ENOSPC, ENOSPC_sb_crypt) \ + x(ENOSPC, ENOSPC_sb_downgrade) \ x(ENOSPC, ENOSPC_btree_slot) \ x(ENOSPC, ENOSPC_snapshot_tree) \ x(ENOENT, ENOENT_bkey_type_mismatch) \ @@ -162,7 +163,7 @@ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(0, restart_recovery) \ - x(0, unwritten_extent_update) \ + x(0, data_update_done) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ @@ -210,6 +211,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ + x(BCH_ERR_invalid_sb, invalid_replicas_entry) \ x(BCH_ERR_invalid_sb, invalid_sb_journal) \ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ @@ -217,6 +219,8 @@ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ x(BCH_ERR_invalid_sb, invalid_sb_errors) \ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ + x(BCH_ERR_invalid_sb, invalid_sb_ext) \ + x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 7b28d37922fd..25cf78a7b946 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -152,6 +152,9 @@ int bch2_fsck_err(struct bch_fs *c, struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + if (test_bit(err, c->sb.errors_silent)) + return -BCH_ERR_fsck_fix; + bch2_sb_error_count(c, err); va_start(args, fmt); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index d167d65986e0..fec17d1353d1 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -157,6 +157,7 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) +__printf(4, 0) static inline void bch2_bkey_fsck_err(struct bch_fs *c, struct printbuf *err_msg, enum bch_sb_error_id err_type, @@ -167,7 +168,6 @@ static inline void bch2_bkey_fsck_err(struct bch_fs *c, va_start(args, fmt); prt_vprintf(err_msg, fmt, args); va_end(args); - } #define bkey_fsck_err(c, _err_msg, _err_type, ...) \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a864de231b69..9d8afcb5979a 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -649,37 +649,31 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return replicas; } -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) { - struct bch_dev *ca; - if (p->ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, p->ptr.dev); - - return ca->mi.durability + - (p->has_ec - ? p->ec.redundancy - : 0); + return p->has_ec + ? p->ec.redundancy + 1 + : ca->mi.durability; } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca; + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); - if (p->ptr.cached) - return 0; + return __extent_ptr_durability(ca, p); +} - ca = bch_dev_bkey_exists(c, p->ptr.dev); +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); if (ca->mi.state == BCH_MEMBER_STATE_failed) return 0; - return ca->mi.durability + - (p->has_ec - ? p->ec.redundancy - : 0); + return __extent_ptr_durability(ca, p); } unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) @@ -1300,7 +1294,8 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, unsigned i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { rewrite_ptrs = 0; goto incompressible; } diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 5b42a76c4796..84e20c3ada6c 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty) } } -static void bch2_dio_read_complete(struct closure *cl) +static CLOSURE_CALLBACK(bch2_dio_read_complete) { - struct dio_read *dio = container_of(cl, struct dio_read, cl); + closure_type(dio, struct dio_read, cl); dio->req->ki_complete(dio->req, dio->ret); bio_check_or_release(&dio->rbio.bio, dio->should_dirty); @@ -216,11 +216,11 @@ struct dio_write { struct address_space *mapping; struct bch_inode_info *inode; struct mm_struct *mm; + const struct iovec *iov; unsigned loop:1, extending:1, sync:1, - flush:1, - free_iov:1; + flush:1; struct quota_res quota_res; u64 written; @@ -312,12 +312,10 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) return -1; if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + dio->iov = iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), GFP_KERNEL); if (unlikely(!iov)) return -ENOMEM; - - dio->free_iov = true; } memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); @@ -325,9 +323,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) return 0; } -static void bch2_dio_write_flush_done(struct closure *cl) +static CLOSURE_CALLBACK(bch2_dio_write_flush_done) { - struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + closure_type(dio, struct dio_write, op.cl); struct bch_fs *c = dio->op.c; closure_debug_destroy(cl); @@ -381,8 +379,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) bch2_pagecache_block_put(inode); - if (dio->free_iov) - kfree(dio->iter.__iov); + kfree(dio->iov); ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); @@ -626,11 +623,11 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) dio->mapping = mapping; dio->inode = inode; dio->mm = current->mm; + dio->iov = NULL; dio->loop = false; dio->extending = extending; dio->sync = is_sync_kiocb(req) || extending; dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; - dio->free_iov = false; dio->quota_res.sectors = 0; dio->written = 0; dio->iter = *iter; diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 8bd9bcdd27f7..ff664fd0d8ef 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -13,7 +13,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, loff_t start, u64 end, - int fgp_flags, gfp_t gfp, + fgf_t fgp_flags, gfp_t gfp, folios *fs) { struct folio *f; diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index a2222ad586e9..27f712ae37a6 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -7,7 +7,7 @@ typedef DARRAY(struct folio *) folios; int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, int, gfp_t, folios *); + u64, fgf_t, gfp_t, folios *); int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); /* diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 5a39bcb597a3..94e5a567fa44 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -100,7 +100,8 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, + ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); @@ -183,13 +184,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_set_projid(c, inode, fa.fsx_projid); - if (ret) - goto err_unlock; - - ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + bch2_set_projid(c, inode, fa.fsx_projid) ?: + bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); -err_unlock: mutex_unlock(&inode->ei_update_lock); err: inode_unlock(&inode->v); @@ -291,14 +289,14 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(c->vfs_sb->s_bdev); + ret = bdev_freeze(c->vfs_sb->s_bdev); if (ret) goto err; bch2_journal_flush(&c->journal); c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); - thaw_bdev(c->vfs_sb->s_bdev); + bdev_thaw(c->vfs_sb->s_bdev); break; case FSOP_GOING_FLAGS_LOGFLUSH: @@ -413,7 +411,7 @@ retry: if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && !arg.src_ptr) - snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 166d8d8abe68..49da8db1d9e9 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -258,7 +258,8 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); - ret = bch2_create_trans(trans, + ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?: + bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, @@ -430,7 +431,9 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - ret = __bch2_link(c, inode, dir, dentry); + ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: + bch2_subvol_is_ro(c, inode->ei_subvol) ?: + __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return ret; @@ -481,7 +484,11 @@ err: static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { - return __bch2_unlink(vdir, dentry, false); + struct bch_inode_info *dir= to_bch_ei(vdir); + struct bch_fs *c = dir->v.i_sb->s_fs_info; + + return bch2_subvol_is_ro(c, dir->ei_subvol) ?: + __bch2_unlink(vdir, dentry, false); } static int bch2_symlink(struct mnt_idmap *idmap, @@ -562,6 +569,11 @@ static int bch2_rename2(struct mnt_idmap *idmap, src_inode, dst_inode); + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: + bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); + if (ret) + goto err; + if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, src_inode, dst_dir->ei_qid, @@ -783,11 +795,13 @@ static int bch2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret; lockdep_assert_held(&inode->v.i_rwsem); - ret = setattr_prepare(idmap, dentry, iattr); + ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + setattr_prepare(idmap, dentry, iattr); if (ret) return ret; @@ -1010,12 +1024,26 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) return bch2_err_class(ret); } +static int bch2_open(struct inode *vinode, struct file *file) +{ + if (file->f_flags & (O_WRONLY|O_RDWR)) { + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + int ret = bch2_subvol_is_ro(c, inode->ei_subvol); + if (ret) + return ret; + } + + return generic_file_open(vinode, file); +} + static const struct file_operations bch_file_operations = { + .open = bch2_open, .llseek = bch2_llseek, .read_iter = bch2_read_iter, .write_iter = bch2_write_iter, .mmap = bch2_mmap, - .open = generic_file_open, .fsync = bch2_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, @@ -1143,24 +1171,33 @@ static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_inode_info *dir = to_bch_ei(vdir); - - if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) - return FILEID_INVALID; + int min_len; if (!S_ISDIR(inode->v.i_mode) && dir) { struct bcachefs_fid_with_parent *fid = (void *) fh; + min_len = sizeof(*fid) / sizeof(u32); + if (*len < min_len) { + *len = min_len; + return FILEID_INVALID; + } + fid->fid = bch2_inode_to_fid(inode); fid->dir = bch2_inode_to_fid(dir); - *len = sizeof(*fid) / sizeof(u32); + *len = min_len; return FILEID_BCACHEFS_WITH_PARENT; } else { struct bcachefs_fid *fid = (void *) fh; + min_len = sizeof(*fid) / sizeof(u32); + if (*len < min_len) { + *len = min_len; + return FILEID_INVALID; + } *fid = bch2_inode_to_fid(inode); - *len = sizeof(*fid) / sizeof(u32); + *len = min_len; return FILEID_BCACHEFS_WITHOUT_PARENT; } } @@ -1667,8 +1704,7 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) if (!first) seq_putc(seq, ':'); first = false; - seq_puts(seq, "/dev/"); - seq_puts(seq, ca->name); + seq_puts(seq, ca->disk_sb.sb_name); } return 0; @@ -1734,6 +1770,9 @@ static int bch2_unfreeze(struct super_block *sb) struct bch_fs *c = sb->s_fs_info; int ret; + if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + return 0; + down_write(&c->state_lock); ret = bch2_fs_read_write(c); up_write(&c->state_lock); @@ -1922,10 +1961,7 @@ out: return dget(sb->s_root); err_put_super: - sb->s_fs_info = NULL; - c->vfs_sb = NULL; deactivate_locked_super(sb); - bch2_fs_stop(c); return ERR_PTR(bch2_err_class(ret)); } @@ -1933,11 +1969,8 @@ static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; - if (c) - c->vfs_sb = NULL; generic_shutdown_super(sb); - if (c) - bch2_fs_free(c); + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9f3e9bd3d767..e0c5cd119acc 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r) const struct nlink *l = _l; const struct nlink *r = _r; - return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); + return cmp_int(l->inum, r->inum); } static void inc_link(struct bch_fs *c, struct snapshots_seen *s, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index def77f2d8802..9309cfeecd8d 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -7,6 +7,7 @@ #include "btree_update.h" #include "buckets.h" #include "compress.h" +#include "dirent.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -1093,11 +1094,15 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) goto out; - if (fsck_err_on(S_ISDIR(inode.bi_mode), c, - deleted_inode_is_dir, - "directory %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; + if (S_ISDIR(inode.bi_mode)) { + ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); + if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + "non empty directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + if (ret) + goto out; + } if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, deleted_inode_not_unlinked, @@ -1134,7 +1139,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, * unlinked inodes in the snapshot leaves: */ *need_another_pass = true; - return 0; + goto out; } ret = 1; @@ -1169,8 +1174,10 @@ again: */ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p, - &need_another_pass)); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); if (ret < 0) break; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index a56ed553dc15..36763865facd 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -209,7 +209,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_data_update_init(trans, NULL, &op->write, + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, (struct data_update_opts) { diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f02b3f7d26a0..8c8cb1541ac9 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -580,9 +580,9 @@ static inline void wp_update_state(struct write_point *wp, bool running) __wp_update_state(wp, state); } -static void bch2_write_index(struct closure *cl) +static CLOSURE_CALLBACK(bch2_write_index) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; @@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) * checksum: */ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum)) + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return -EIO; ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); @@ -1208,14 +1208,20 @@ static void __bch2_nocow_write_done(struct bch_write_op *op) bch2_nocow_write_convert_unwritten(op); } -static void bch2_nocow_write_done(struct closure *cl) +static CLOSURE_CALLBACK(bch2_nocow_write_done) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); __bch2_nocow_write_done(op); bch2_write_done(cl); } +struct bucket_to_lock { + struct bpos b; + unsigned gen; + struct nocow_lock_bucket *l; +}; + static void bch2_nocow_write(struct bch_write_op *op) { struct bch_fs *c = op->c; @@ -1224,18 +1230,16 @@ static void bch2_nocow_write(struct bch_write_op *op) struct bkey_s_c k; struct bkey_ptrs_c ptrs; const struct bch_extent_ptr *ptr; - struct { - struct bpos b; - unsigned gen; - struct nocow_lock_bucket *l; - } buckets[BCH_REPLICAS_MAX]; - unsigned nr_buckets = 0; + DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; + struct bucket_to_lock *i; u32 snapshot; - int ret, i; + struct bucket_to_lock *stale_at; + int ret; if (op->flags & BCH_WRITE_MOVE) return; + darray_init(&buckets); trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); @@ -1250,7 +1254,7 @@ retry: while (1) { struct bio *bio = &op->wbio.bio; - nr_buckets = 0; + buckets.nr = 0; k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -1263,26 +1267,26 @@ retry: break; if (bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - k.k->u64s)) + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) break; /* Get iorefs before dropping btree locks: */ ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); - buckets[nr_buckets].gen = ptr->gen; - buckets[nr_buckets].l = - bucket_nocow_lock(&c->nocow_locks, - bucket_to_u64(buckets[nr_buckets].b)); - - prefetch(buckets[nr_buckets].l); + struct bpos b = PTR_BUCKET_POS(c, ptr); + struct nocow_lock_bucket *l = + bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); + prefetch(l); if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) goto err_get_ioref; - nr_buckets++; + /* XXX allocating memory with btree locks held - rare */ + darray_push_gfp(&buckets, ((struct bucket_to_lock) { + .b = b, .gen = ptr->gen, .l = l, + }), GFP_KERNEL|__GFP_NOFAIL); if (ptr->unwritten) op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; @@ -1296,21 +1300,21 @@ retry: if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - for (i = 0; i < nr_buckets; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); - struct nocow_lock_bucket *l = buckets[i].l; - bool stale; + darray_for_each(buckets, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->b.inode); - __bch2_bucket_nocow_lock(&c->nocow_locks, l, - bucket_to_u64(buckets[i].b), + __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, + bucket_to_u64(i->b), BUCKET_NOCOW_LOCK_UPDATE); rcu_read_lock(); - stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen); rcu_read_unlock(); - if (unlikely(stale)) + if (unlikely(stale)) { + stale_at = i; goto err_bucket_stale; + } } bio = &op->wbio.bio; @@ -1346,15 +1350,14 @@ err: if (ret) { bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s: btree lookup error %s", - __func__, bch2_err_str(ret)); + op->pos.inode, op->pos.offset << 9, + "%s: btree lookup error %s", __func__, bch2_err_str(ret)); op->error = ret; op->flags |= BCH_WRITE_DONE; } bch2_trans_put(trans); + darray_exit(&buckets); /* fallback to cow write path? */ if (!(op->flags & BCH_WRITE_DONE)) { @@ -1363,7 +1366,7 @@ err: op->insert_keys.top = op->insert_keys.keys; } else if (op->flags & BCH_WRITE_SYNC) { closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl); + bch2_nocow_write_done(&op->cl.work); } else { /* * XXX @@ -1374,24 +1377,21 @@ err: } return; err_get_ioref: - for (i = 0; i < nr_buckets; i++) - percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + darray_for_each(buckets, i) + percpu_ref_put(&bch_dev_bkey_exists(c, i->b.inode)->io_ref); /* Fall back to COW path: */ goto out; err_bucket_stale: - while (i >= 0) { - bch2_bucket_nocow_unlock(&c->nocow_locks, - buckets[i].b, - BUCKET_NOCOW_LOCK_UPDATE); - --i; + darray_for_each(buckets, i) { + bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); + if (i == stale_at) + break; } - for (i = 0; i < nr_buckets; i++) - percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* We can retry this: */ ret = -BCH_ERR_transaction_restart; - goto out; + goto err_get_ioref; } static void __bch2_write(struct bch_write_op *op) @@ -1566,9 +1566,9 @@ err: * If op->discard is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode. */ -void bch2_write(struct closure *cl) +CLOSURE_CALLBACK(bch2_write) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; unsigned data_len; diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 9323167229ee..6c276a48f95d 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -90,8 +90,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->devs_need_flush = NULL; } -void bch2_write(struct closure *); - +CLOSURE_CALLBACK(bch2_write); void bch2_write_point_do_index_updates(struct work_struct *); static inline struct bch_write_bio *wbio_init(struct bio *bio) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5b5d69f2316b..8cf238be6213 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -249,7 +249,7 @@ static bool journal_entry_want_write(struct journal *j) return ret; } -static bool journal_entry_close(struct journal *j) +bool bch2_journal_entry_close(struct journal *j) { bool ret; @@ -321,6 +321,8 @@ static int journal_entry_open(struct journal *j) atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); @@ -381,7 +383,7 @@ static bool journal_quiesced(struct journal *j) bool ret = atomic64_read(&j->seq) == j->seq_ondisk; if (!ret) - journal_entry_close(j); + bch2_journal_entry_close(j); return ret; } @@ -434,7 +436,7 @@ retry: /* * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call journal_entry_close() + * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { @@ -526,36 +528,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } -/* journal_preres: */ - -static bool journal_preres_available(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); - - if (!ret && mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - - return ret; -} - -int __bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - int ret; - - closure_wait_event(&j->preres_wait, - (ret = bch2_journal_error(j)) || - journal_preres_available(j, res, new_u64s, flags)); - return ret; -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -1069,7 +1041,7 @@ void bch2_fs_journal_stop(struct journal *j) bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); - wait_event(j->wait, journal_entry_close(j)); + wait_event(j->wait, bch2_journal_entry_close(j)); /* * Always write a new journal entry, to make sure the clock hands are up @@ -1306,7 +1278,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 011711e99c8d..2f768e11aec9 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -136,9 +136,7 @@ static inline u64 journal_last_seq(struct journal *j) static inline u64 journal_cur_seq(struct journal *j) { - EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - return j->pin.back - 1; + return atomic64_read(&j->seq); } static inline u64 journal_last_unwritten_seq(struct journal *j) @@ -268,6 +266,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u return s; } +bool bch2_journal_entry_close(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64, bool); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -395,104 +394,6 @@ out: return 0; } -/* journal_preres: */ - -static inline void journal_set_watermark(struct journal *j) -{ - union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = BCH_WATERMARK_stripe; - - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (!s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static inline void bch2_journal_preres_put(struct journal *j, - struct journal_preres *res) -{ - union journal_preres_state s = { .reserved = res->u64s }; - - if (!res->u64s) - return; - - s.v = atomic64_sub_return(s.v, &j->prereserved.counter); - res->u64s = 0; - - if (unlikely(s.waiting)) { - clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), - (unsigned long *) &j->prereserved.v); - closure_wake_up(&j->preres_wait); - } - - if (s.reserved <= s.remaining && j->watermark) - journal_set_watermark(j); -} - -int __bch2_journal_preres_get(struct journal *, - struct journal_preres *, unsigned, unsigned); - -static inline int bch2_journal_preres_get_fast(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags, - bool set_waiting) -{ - int d = new_u64s - res->u64s; - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret; - - do { - old.v = new.v = v; - ret = 0; - - if (watermark == BCH_WATERMARK_reclaim || - new.reserved + d < new.remaining) { - new.reserved += d; - ret = 1; - } else if (set_waiting && !new.waiting) - new.waiting = true; - else - return 0; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); - - if (ret) - res->u64s += d; - return ret; -} - -static inline int bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - if (new_u64s <= res->u64s) - return 0; - - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) - return 0; - - if (flags & JOURNAL_RES_GET_NONBLOCK) - return -BCH_ERR_journal_preres_get_blocked; - - return __bch2_journal_preres_get(j, res, new_u64s, flags); -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f4bc2cdbfdd7..3eb6c3f62a81 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -408,8 +408,10 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, flags); + ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, + version, big_endian, flags); + if (ret == FSCK_DELETED_KEY) + ret = 0; fsck_err: return ret; } @@ -547,6 +549,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + struct printbuf err = PRINTBUF; int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u) || @@ -555,10 +558,19 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, journal_entry_data_usage_bad_size, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); - return ret; + goto out; } + if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), + c, version, jset, entry, + journal_entry_data_usage_bad_size, + "invalid journal entry usage: %s", err.buf)) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; + } +out: fsck_err: + printbuf_exit(&err); return ret; } @@ -1025,10 +1037,9 @@ next_block: return 0; } -static void bch2_journal_read_device(struct closure *cl) +static CLOSURE_CALLBACK(bch2_journal_read_device) { - struct journal_device *ja = - container_of(cl, struct journal_device, read); + closure_type(ja, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); struct bch_fs *c = ca->fs; struct journal_list *jlist = @@ -1079,6 +1090,12 @@ found: if (ja->bucket_seq[ja->cur_idx] && ja->sectors_free == ca->mi.bucket_size) { +#if 0 + /* + * Debug code for ZNS support, where we (probably) want to be + * correlated where we stopped in the journal to the zone write + * points: + */ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); for (i = 0; i < 3; i++) { @@ -1086,6 +1103,7 @@ found: bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); } +#endif ja->sectors_free = 0; } @@ -1513,9 +1531,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); } -static void journal_write_done(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_done) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; @@ -1583,6 +1601,7 @@ static void journal_write_done(struct closure *cl) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); closure_wake_up(&w->wait); @@ -1631,9 +1650,9 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } -static void do_journal_write(struct closure *cl) +static CLOSURE_CALLBACK(do_journal_write) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); @@ -1843,9 +1862,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * return 0; } -void bch2_journal_write(struct closure *cl) +CLOSURE_CALLBACK(bch2_journal_write) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index a88d097b13f1..c035e7c108e1 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -60,6 +60,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); -void bch2_journal_write(struct closure *); +CLOSURE_CALLBACK(bch2_journal_write); #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9a584aaaa2eb..ec712104addb 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +static inline void journal_set_watermark(struct journal *j, bool low_on_space) { - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); + unsigned watermark = BCH_WATERMARK_stripe; - do { - old.v = new.v = v; - new.remaining = u64s_remaining; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); + if (low_on_space) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static struct journal_space @@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned clean, clean_ondisk, total; - s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -222,16 +226,10 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - u64s_remaining = (u64) clean << 6; - u64s_remaining -= (u64) total << 3; - u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 4; - u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); + journal_set_watermark(j, clean * 4 <= total); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; - journal_set_remaining(j, u64s_remaining); - journal_set_watermark(j); if (!ret) journal_wake(j); @@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - /* And include pre-reservations: */ - nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, - (ca->mi.bucket_size << 6) - - journal_entry_overhead(j)); - nr_buckets = min(nr_buckets, ja->nr); bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; @@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; - if (j->prereserved.reserved * 4 > j->prereserved.remaining) - min_nr = 1; - - if (fifo_free(&j->pin) <= 32) + if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) @@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - j->prereserved.reserved, - j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, atomic_long_read(&c->btree_key_cache.nr_dirty), @@ -788,6 +776,9 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, (1U << JOURNAL_PIN_btree), 0, 0, 0)) *did_work = true; + if (seq_to_flush > journal_cur_seq(j)) + bch2_journal_entry_close(j); + spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 42504e16acb6..a756b69582e3 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -76,14 +76,6 @@ struct journal_res { u64 seq; }; -/* - * For reserving space in the journal prior to getting a reservation on a - * particular journal entry: - */ -struct journal_preres { - unsigned u64s; -}; - union journal_res_state { struct { atomic64_t counter; @@ -104,22 +96,6 @@ union journal_res_state { }; }; -union journal_preres_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 waiting:1, - reserved:31, - remaining:32; - }; -}; - /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -180,8 +156,6 @@ struct journal { union journal_res_state reservations; enum bch_watermark watermark; - union journal_preres_state prereserved; - } __aligned(SMP_CACHE_BYTES); unsigned long flags; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ab749bf2fcbc..54830ee0ed88 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -49,17 +49,6 @@ static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) } } -static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k) -{ - if (trace_move_extent_alloc_mem_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_alloc_mem_fail(c, buf.buf); - printbuf_exit(&buf); - } -} - struct moving_io { struct list_head read_list; struct list_head io_list; @@ -163,12 +152,18 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->write_sectors) != sectors_pending); } +static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + bch2_trans_unlock_long(ctxt->trans); + closure_sync(&ctxt->cl); +} + void bch2_moving_ctxt_exit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); - closure_sync(&ctxt->cl); + bch2_moving_ctxt_flush_all(ctxt); EBUG_ON(atomic_read(&ctxt->write_sectors)); EBUG_ON(atomic_read(&ctxt->write_ios)); @@ -223,49 +218,6 @@ void bch2_move_stats_init(struct bch_move_stats *stats, char *name) scnprintf(stats->name, sizeof(stats->name), "%s", name); } -static int bch2_extent_drop_ptrs(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct data_update_opts data_opts) -{ - struct bch_fs *c = trans->c; - struct bkey_i *n; - int ret; - - n = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - while (data_opts.kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - struct bch_extent_ptr *ptr; - - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); - data_opts.kill_ptrs ^= 1U << drop; - } - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize(c, bkey_i_to_s(n)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&n->k)) - n->k.size = 0; - - return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); -} - int bch2_move_extent(struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, struct btree_iter *iter, @@ -335,19 +287,11 @@ int bch2_move_extent(struct moving_context *ctxt, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, io_opts, data_opts, iter->btree_id, k); - if (ret && ret != -BCH_ERR_unwritten_extent_update) + if (ret) goto err_free_pages; - if (ret == -BCH_ERR_unwritten_extent_update) { - bch2_update_unwritten_extent(trans, &io->write); - move_free(io); - return 0; - } - - BUG_ON(ret); - io->write.op.end_io = move_write_done; if (ctxt->rate) @@ -391,8 +335,23 @@ err_free_pages: err_free: kfree(io); err: - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); - trace_move_extent_alloc_mem_fail2(c, k); + if (ret == -BCH_ERR_data_update_done) + return 0; + + if (bch2_err_matches(ret, EROFS) || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); + if (trace_move_extent_start_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, ": "); + prt_str(&buf, bch2_err_str(ret)); + trace_move_extent_start_fail(c, buf.buf); + printbuf_exit(&buf); + } return ret; } @@ -482,37 +441,30 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, int bch2_move_ratelimit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; + bool is_kthread = current->flags & PF_KTHREAD; u64 delay; - if (ctxt->wait_on_copygc && !c->copygc_running) { - bch2_trans_unlock_long(ctxt->trans); + if (ctxt->wait_on_copygc && c->copygc_running) { + bch2_moving_ctxt_flush_all(ctxt); wait_event_killable(c->copygc_running_wq, !c->copygc_running || - kthread_should_stop()); + (is_kthread && kthread_should_stop())); } do { delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; - - if (delay) { - if (delay > HZ / 10) - bch2_trans_unlock_long(ctxt->trans); - else - bch2_trans_unlock(ctxt->trans); - set_current_state(TASK_INTERRUPTIBLE); - } - - if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { - __set_current_state(TASK_RUNNING); + if (is_kthread && kthread_should_stop()) return 1; - } if (delay) - schedule_timeout(delay); + move_ctxt_wait_event_timeout(ctxt, + freezing(current) || + (is_kthread && kthread_should_stop()), + delay); if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + bch2_moving_ctxt_flush_all(ctxt); try_to_freeze(); } } while (delay); @@ -683,6 +635,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; + bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_iter iter; struct bkey_buf sk; @@ -728,6 +681,9 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, } while (!(ret = bch2_move_ratelimit(ctxt))) { + if (is_kthread && kthread_should_stop()) + break; + bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 07cf9d42643b..0906aa2d1de2 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -38,6 +38,25 @@ struct moving_context { wait_queue_head_t wait; }; +#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \ +({ \ + int _ret = 0; \ + while (true) { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ + bch2_trans_unlock_long((_ctxt)->trans); \ + _ret = __wait_event_timeout((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond)), _timeout); \ + if (_ret || ( cond_finished)) \ + break; \ + } \ + _ret; \ +}) + #define move_ctxt_wait_event(_ctxt, _cond) \ do { \ bool cond_finished = false; \ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0a0576326c5b..a84e79f79e5e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -207,7 +207,7 @@ static int bch2_copygc(struct moving_context *ctxt, goto err; darray_for_each(buckets, i) { - if (unlikely(freezing(current))) + if (kthread_should_stop() || freezing(current)) break; f = move_bucket_in_flight_add(buckets_in_flight, *i); diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index 5e653eb81d54..accf246c3233 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: LGPL-2.1+ /* Copyright (C) 2022 Kent Overstreet */ +#include <linux/bitmap.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kernel.h> @@ -423,3 +424,24 @@ void bch2_prt_bitflags(struct printbuf *out, flags ^= BIT_ULL(bit); } } + +void bch2_prt_bitflags_vector(struct printbuf *out, + const char * const list[], + unsigned long *v, unsigned nr) +{ + bool first = true; + unsigned i; + + for (i = 0; i < nr; i++) + if (!list[i]) { + nr = i - 1; + break; + } + + for_each_set_bit(i, v, nr) { + if (!first) + bch2_prt_printf(out, ","); + first = false; + bch2_prt_printf(out, "%s", list[i]); + } +} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 2191423d9f22..9a4a56c40937 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -124,6 +124,8 @@ void bch2_prt_units_u64(struct printbuf *, u64); void bch2_prt_units_s64(struct printbuf *, s64); void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); +void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], + unsigned long *, unsigned); /* Initializer for a heap allocated printbuf: */ #define PRINTBUF ((struct printbuf) { .heap_allocated = true }) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9c30500ce920..5cf7d0532002 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -27,6 +27,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-downgrade.h" #include "snapshot.h" #include "subvolume.h" #include "super-io.h" @@ -144,7 +145,7 @@ static int bch2_journal_replay(struct bch_fs *c) u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; size_t i; - int ret; + int ret = 0; move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; @@ -167,6 +168,8 @@ static int bch2_journal_replay(struct bch_fs *c) goto err; } + BUG_ON(!atomic_read(&keys->ref)); + for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; @@ -188,6 +191,9 @@ static int bch2_journal_replay(struct bch_fs *c) } } + if (!c->opts.keep_journal) + bch2_journal_keys_put_initial(c); + replay_now_at(j, j->replay_journal_seq_end); j->replay_journal_seq = 0; @@ -476,7 +482,7 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) } const char * const bch2_recovery_passes[] = { -#define x(_fn, _when) #_fn, +#define x(_fn, ...) #_fn, BCH_RECOVERY_PASSES() #undef x NULL @@ -499,18 +505,47 @@ struct recovery_pass_fn { }; static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, +#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, BCH_RECOVERY_PASSES() #undef x }; -static void check_version_upgrade(struct bch_fs *c) +u64 bch2_recovery_passes_to_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +static bool check_version_upgrade(struct bch_fs *c) { unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); unsigned latest_version = bcachefs_metadata_version_current; unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; - u64 recovery_passes; if (old_version < bcachefs_metadata_required_upgrade_below) { if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || @@ -554,7 +589,7 @@ static void check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, new_version); prt_newline(&buf); - recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); + u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); if (recovery_passes) { if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) prt_str(&buf, "fsck required"); @@ -569,12 +604,13 @@ static void check_version_upgrade(struct bch_fs *c) bch_info(c, "%s", buf.buf); - mutex_lock(&c->sb_lock); bch2_sb_upgrade(c, new_version); - mutex_unlock(&c->sb_lock); printbuf_exit(&buf); + return true; } + + return false; } u64 bch2_fsck_recovery_passes(void) @@ -649,7 +685,6 @@ int bch2_fs_recovery(struct bch_fs *c) struct bch_sb_field_clean *clean = NULL; struct jset *last_journal_entry = NULL; u64 last_seq = 0, blacklist_seq, journal_seq; - bool write_sb = false; int ret = 0; if (c->sb.clean) { @@ -677,15 +712,73 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) - check_version_upgrade(c); - if (c->opts.fsck && c->opts.norecovery) { bch_err(c, "cannot select both norecovery and fsck"); ret = -EINVAL; goto err; } + if (!(c->opts.nochanges && c->opts.norecovery)) { + mutex_lock(&c->sb_lock); + bool write_sb = false; + + struct bch_sb_field_ext *ext = + bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); + if (!ext) { + ret = -BCH_ERR_ENOSPC_sb; + mutex_unlock(&c->sb_lock); + goto err; + } + + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { + ext->recovery_passes_required[0] |= + cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); + write_sb = true; + } + + u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (sb_passes) { + struct printbuf buf = PRINTBUF; + prt_str(&buf, "superblock requires following recovery passes to be run:\n "); + prt_bitflags(&buf, bch2_recovery_passes, sb_passes); + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + if (bch2_check_version_downgrade(c)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Version downgrade required:\n"); + + __le64 passes = ext->recovery_passes_required[0]; + bch2_sb_set_downgrade(c, + BCH_VERSION_MINOR(bcachefs_metadata_version_current), + BCH_VERSION_MINOR(c->sb.version)); + passes = ext->recovery_passes_required[0] & ~passes; + if (passes) { + prt_str(&buf, " running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); + } + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + write_sb = true; + } + + if (check_version_upgrade(c)) + write_sb = true; + + if (write_sb) + bch2_write_super(c); + + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + mutex_unlock(&c->sb_lock); + } + + if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -822,11 +915,6 @@ use_clean: if (ret) goto err; - if (c->opts.fsck && - (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || - BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))) - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); - ret = bch2_run_recovery_passes(c); if (ret) goto err; @@ -863,16 +951,30 @@ use_clean: } mutex_lock(&c->sb_lock); - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) { - SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version); + bool write_sb = false; + + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags)) { + if (!test_bit(BCH_FS_ERROR, &c->flags) && + !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } + if (!test_bit(BCH_FS_ERROR, &c->flags)) { + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + if (ext && + (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { + memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); + write_sb = true; + } + } + if (c->opts.fsck && !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { @@ -909,10 +1011,8 @@ out: bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) { - bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(c); - } + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + bch2_journal_keys_put_initial(c); kfree(clean); if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { @@ -944,7 +1044,7 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - bch2_sb_maybe_downgrade(c); + bch2_check_version_downgrade(c); if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { bch2_sb_upgrade(c, bcachefs_metadata_version_current); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 852d30567da9..3a554b0751d0 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -4,12 +4,18 @@ extern const char * const bch2_recovery_passes[]; +u64 bch2_recovery_passes_to_stable(u64 v); +u64 bch2_recovery_passes_from_stable(u64 v); + /* * For when we need to rewind recovery passes and run a pass we skipped: */ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return 0; + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index 515e3d62c2ac..d37c6fd30e38 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -7,45 +7,57 @@ #define PASS_UNCLEAN BIT(2) #define PASS_ALWAYS BIT(3) -#define BCH_RECOVERY_PASSES() \ - x(alloc_read, PASS_ALWAYS) \ - x(stripes_read, PASS_ALWAYS) \ - x(initialize_subvolumes, 0) \ - x(snapshots_read, PASS_ALWAYS) \ - x(check_topology, 0) \ - x(check_allocations, PASS_FSCK) \ - x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, PASS_ALWAYS) \ - x(check_alloc_info, PASS_FSCK) \ - x(check_lrus, PASS_FSCK) \ - x(check_btree_backpointers, PASS_FSCK) \ - x(check_backpointers_to_extents,PASS_FSCK) \ - x(check_extents_to_backpointers,PASS_FSCK) \ - x(check_alloc_to_lru_refs, PASS_FSCK) \ - x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 0) \ - x(check_snapshot_trees, PASS_FSCK) \ - x(check_snapshots, PASS_FSCK) \ - x(check_subvols, PASS_FSCK) \ - x(delete_dead_snapshots, PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 0) \ - x(resume_logged_ops, PASS_ALWAYS) \ - x(check_inodes, PASS_FSCK) \ - x(check_extents, PASS_FSCK) \ - x(check_indirect_extents, PASS_FSCK) \ - x(check_dirents, PASS_FSCK) \ - x(check_xattrs, PASS_FSCK) \ - x(check_root, PASS_FSCK) \ - x(check_directory_structure, PASS_FSCK) \ - x(check_nlinks, PASS_FSCK) \ - x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ - x(fix_reflink_p, 0) \ - x(set_fs_needs_rebalance, 0) \ +/* + * Passes may be reordered, but the second field is a persistent identifier and + * must never change: + */ +#define BCH_RECOVERY_PASSES() \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, PASS_ALWAYS) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_topology, 4, 0) \ + x(check_allocations, 5, PASS_FSCK) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_FSCK) \ + x(check_lrus, 11, PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_FSCK) \ + x(check_extents_to_backpointers, 14, PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_FSCK) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(check_snapshot_trees, 18, PASS_FSCK) \ + x(check_snapshots, 19, PASS_FSCK) \ + x(check_subvols, 20, PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_FSCK) \ + x(check_directory_structure, 30, PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ + x(fix_reflink_p, 33, 0) \ + x(set_fs_needs_rebalance, 34, 0) \ +/* We normally enumerate recovery passes in the order we run them: */ enum bch_recovery_pass { -#define x(n, when) BCH_RECOVERY_PASS_##n, +#define x(n, id, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +/* But we also need stable identifiers that can be used in the superblock */ +enum bch_recovery_pass_stable { +#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, BCH_RECOVERY_PASSES() #undef x }; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 6e1bfe9feb59..37d16e04e671 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -121,6 +121,14 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, { check_indirect_extent_deleting(new, &flags); + if (old.k->type == KEY_TYPE_reflink_v && + new->k.type == KEY_TYPE_reflink_v && + old.k->u64s == new->k.u64s && + !memcmp(bkey_s_c_to_reflink_v(old).v->start, + bkey_i_to_reflink_v(new)->v.start, + bkey_val_bytes(&new->k) - 8)) + return 0; + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1c3ae13bfced..2008fe8bf706 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -68,6 +68,33 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } +int bch2_replicas_entry_validate(struct bch_replicas_entry *r, + struct bch_sb *sb, + struct printbuf *err) +{ + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (!bch2_dev_exists(sb, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; +} + void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { @@ -163,7 +190,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, } static struct bch_replicas_cpu -cpu_replicas_add_entry(struct bch_replicas_cpu *old, +cpu_replicas_add_entry(struct bch_fs *c, + struct bch_replicas_cpu *old, struct bch_replicas_entry *new_entry) { unsigned i; @@ -173,6 +201,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, replicas_entry_bytes(new_entry)), }; + for (i = 0; i < new_entry->nr_devs; i++) + BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); + BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); @@ -382,7 +413,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (c->replicas_gc.entries && !__replicas_has_entry(&c->replicas_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); + new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); if (!new_gc.entries) { ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; @@ -390,7 +421,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, } if (!__replicas_has_entry(&c->replicas, new_entry)) { - new_r = cpu_replicas_add_entry(&c->replicas, new_entry); + new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); if (!new_r.entries) { ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; @@ -598,7 +629,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, if (idx < 0) { struct bch_replicas_cpu n; - n = cpu_replicas_add_entry(&c->replicas, r); + n = cpu_replicas_add_entry(c, &c->replicas, r); if (!n.entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -797,7 +828,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_sb *sb, struct printbuf *err) { - unsigned i, j; + unsigned i; sort_cmp_size(cpu_r->entries, cpu_r->nr, @@ -808,31 +839,9 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry *e = cpu_replicas_entry(cpu_r, i); - if (e->data_type >= BCH_DATA_NR) { - prt_printf(err, "invalid data type in entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - - if (!e->nr_devs) { - prt_printf(err, "no devices in entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - - if (e->nr_required > 1 && - e->nr_required >= e->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - - for (j = 0; j < e->nr_devs; j++) - if (!bch2_dev_exists(sb, e->devs[j])) { - prt_printf(err, "invalid device %u in entry ", e->devs[j]); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } + int ret = bch2_replicas_entry_validate(e, sb, err); + if (ret) + return ret; if (i + 1 < cpu_r->nr) { struct bch_replicas_entry *n = diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 4887675a86f0..f70a642775d1 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -9,6 +9,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry *); +int bch2_replicas_entry_validate(struct bch_replicas_entry *, + struct bch_sb *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); static inline struct bch_replicas_entry * diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index e151ada1c8bd..c76ad8ea5e4a 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -332,8 +332,6 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_sb_maybe_downgrade(c); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ret = bch2_write_super(c); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c new file mode 100644 index 000000000000..4919237bbe73 --- /dev/null +++ b/fs/bcachefs/sb-downgrade.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Superblock section that contains a list of recovery passes to run when + * downgrading past a given version + */ + +#include "bcachefs.h" +#include "darray.h" +#include "recovery.h" +#include "sb-downgrade.h" +#include "sb-errors.h" +#include "super-io.h" + +/* + * Downgrade table: + * When dowgrading past certain versions, we need to run certain recovery passes + * and fix certain errors: + * + * x(version, recovery_passes, errors...) + */ + +#define DOWNGRADE_TABLE() + +struct downgrade_entry { + u64 recovery_passes; + u16 version; + u16 nr_errors; + const u16 *errors; +}; + +#define x(ver, passes, ...) static const u16 ver_##errors[] = { __VA_ARGS__ }; +DOWNGRADE_TABLE() +#undef x + +static const struct downgrade_entry downgrade_table[] = { +#define x(ver, passes, ...) { \ + .recovery_passes = passes, \ + .version = bcachefs_metadata_version_##ver,\ + .nr_errors = ARRAY_SIZE(ver_##errors), \ + .errors = ver_##errors, \ +}, +DOWNGRADE_TABLE() +#undef x +}; + +static inline const struct bch_sb_field_downgrade_entry * +downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e) +{ + return (void *) &e->errors[le16_to_cpu(e->nr_errors)]; +} + +#define for_each_downgrade_entry(_d, _i) \ + for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \ + (void *) _i < vstruct_end(&(_d)->field) && \ + (void *) &_i->errors[0] < vstruct_end(&(_d)->field); \ + _i = downgrade_entry_next_c(_i)) + +static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); + + for_each_downgrade_entry(e, i) { + if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) != + BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) { + prt_printf(err, "downgrade entry with mismatched major version (%u != %u)", + BCH_VERSION_MAJOR(le16_to_cpu(i->version)), + BCH_VERSION_MAJOR(le16_to_cpu(sb->version))); + return -BCH_ERR_invalid_sb_downgrade; + } + } + + return 0; +} + +static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_downgrade *e = field_to_type(f, downgrade); + + if (out->nr_tabstops <= 1) + printbuf_tabstop_push(out, 16); + + for_each_downgrade_entry(e, i) { + prt_str(out, "version:"); + prt_tab(out); + bch2_version_to_text(out, le16_to_cpu(i->version)); + prt_newline(out); + + prt_str(out, "recovery passes:"); + prt_tab(out); + prt_bitflags(out, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0]))); + prt_newline(out); + + prt_str(out, "errors:"); + prt_tab(out); + bool first = true; + for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { + if (!first) + prt_char(out, ','); + first = false; + unsigned e = le16_to_cpu(i->errors[j]); + prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)"); + } + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_downgrade = { + .validate = bch2_sb_downgrade_validate, + .to_text = bch2_sb_downgrade_to_text, +}; + +int bch2_sb_downgrade_update(struct bch_fs *c) +{ + darray_char table = {}; + int ret = 0; + + for (const struct downgrade_entry *src = downgrade_table; + src < downgrade_table + ARRAY_SIZE(downgrade_table); + src++) { + if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) + continue; + + struct bch_sb_field_downgrade_entry *dst; + unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; + + ret = darray_make_room(&table, bytes); + if (ret) + goto out; + + dst = (void *) &darray_top(table); + dst->version = cpu_to_le16(src->version); + dst->recovery_passes[0] = cpu_to_le64(src->recovery_passes); + dst->recovery_passes[1] = 0; + dst->nr_errors = cpu_to_le16(src->nr_errors); + for (unsigned i = 0; i < src->nr_errors; i++) + dst->errors[i] = cpu_to_le16(src->errors[i]); + + table.nr += bytes; + } + + struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); + + unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64)); + + if (d && le32_to_cpu(d->field.u64s) > sb_u64s) + goto out; + + d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); + if (!d) { + ret = -BCH_ERR_ENOSPC_sb_downgrade; + goto out; + } + + memcpy(d->entries, table.data, table.nr); + memset_u64s_tail(d->entries, 0, table.nr); +out: + darray_exit(&table); + return ret; +} + +void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor) +{ + struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade); + if (!d) + return; + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + for_each_downgrade_entry(d, i) { + unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version)); + if (new_minor < minor && minor <= old_minor) { + ext->recovery_passes_required[0] |= i->recovery_passes[0]; + ext->recovery_passes_required[1] |= i->recovery_passes[1]; + + for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { + unsigned e = le16_to_cpu(i->errors[j]); + if (e < BCH_SB_ERR_MAX) + __set_bit(e, c->sb.errors_silent); + if (e < sizeof(ext->errors_silent) * 8) + ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64)); + } + } + } +} diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h new file mode 100644 index 000000000000..bc48fd2ca70e --- /dev/null +++ b/fs/bcachefs/sb-downgrade.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_DOWNGRADE_H +#define _BCACHEFS_SB_DOWNGRADE_H + +extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade; + +int bch2_sb_downgrade_update(struct bch_fs *); +void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned); + +#endif /* _BCACHEFS_SB_DOWNGRADE_H */ diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index f0930ab7f036..5f5bcae391fb 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -4,7 +4,7 @@ #include "sb-errors.h" #include "super-io.h" -static const char * const bch2_sb_error_strs[] = { +const char * const bch2_sb_error_strs[] = { #define x(t, n, ...) [n] = #t, BCH_SB_ERRS() NULL @@ -20,9 +20,7 @@ static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) { - return e - ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]) - : 0; + return bch2_sb_field_nr_entries(e); } static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h index 5a09a53966be..8889001e7db4 100644 --- a/fs/bcachefs/sb-errors.h +++ b/fs/bcachefs/sb-errors.h @@ -4,258 +4,7 @@ #include "sb-errors_types.h" -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 50) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_pos_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(dirent_d_parent_subvol_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) - -enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x - BCH_SB_ERR_MAX -}; +extern const char * const bch2_sb_error_strs[]; extern const struct bch_sb_field_ops bch_sb_field_ops_errors; diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index b1c099843a39..3504c2d09c29 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -4,6 +4,259 @@ #include "darray.h" +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0) \ + x(dirty_but_no_journal_entries, 1) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ + x(sb_clean_journal_seq_mismatch, 3) \ + x(sb_clean_btree_root_mismatch, 4) \ + x(sb_clean_missing, 5) \ + x(jset_unsupported_version, 6) \ + x(jset_unknown_csum, 7) \ + x(jset_last_seq_newer_than_seq, 8) \ + x(jset_past_bucket_end, 9) \ + x(jset_seq_blacklisted, 10) \ + x(journal_entries_missing, 11) \ + x(journal_entry_replicas_not_marked, 12) \ + x(journal_entry_past_jset_end, 13) \ + x(journal_entry_replicas_data_mismatch, 14) \ + x(journal_entry_bkey_u64s_0, 15) \ + x(journal_entry_bkey_past_end, 16) \ + x(journal_entry_bkey_bad_format, 17) \ + x(journal_entry_bkey_invalid, 18) \ + x(journal_entry_btree_root_bad_size, 19) \ + x(journal_entry_blacklist_bad_size, 20) \ + x(journal_entry_blacklist_v2_bad_size, 21) \ + x(journal_entry_blacklist_v2_start_past_end, 22) \ + x(journal_entry_usage_bad_size, 23) \ + x(journal_entry_data_usage_bad_size, 24) \ + x(journal_entry_clock_bad_size, 25) \ + x(journal_entry_clock_bad_rw, 26) \ + x(journal_entry_dev_usage_bad_size, 27) \ + x(journal_entry_dev_usage_bad_dev, 28) \ + x(journal_entry_dev_usage_bad_pad, 29) \ + x(btree_node_unreadable, 30) \ + x(btree_node_fault_injected, 31) \ + x(btree_node_bad_magic, 32) \ + x(btree_node_bad_seq, 33) \ + x(btree_node_unsupported_version, 34) \ + x(btree_node_bset_older_than_sb_min, 35) \ + x(btree_node_bset_newer_than_sb, 36) \ + x(btree_node_data_missing, 37) \ + x(btree_node_bset_after_end, 38) \ + x(btree_node_replicas_sectors_written_mismatch, 39) \ + x(btree_node_replicas_data_mismatch, 40) \ + x(bset_unknown_csum, 41) \ + x(bset_bad_csum, 42) \ + x(bset_past_end_of_btree_node, 43) \ + x(bset_wrong_sector_offset, 44) \ + x(bset_empty, 45) \ + x(bset_bad_seq, 46) \ + x(bset_blacklisted_journal_seq, 47) \ + x(first_bset_blacklisted_journal_seq, 48) \ + x(btree_node_bad_btree, 49) \ + x(btree_node_bad_level, 50) \ + x(btree_node_bad_min_key, 51) \ + x(btree_node_bad_max_key, 52) \ + x(btree_node_bad_format, 53) \ + x(btree_node_bkey_past_bset_end, 54) \ + x(btree_node_bkey_bad_format, 55) \ + x(btree_node_bad_bkey, 56) \ + x(btree_node_bkey_out_of_order, 57) \ + x(btree_root_bkey_invalid, 58) \ + x(btree_root_read_error, 59) \ + x(btree_root_bad_min_key, 60) \ + x(btree_root_bad_max_key, 61) \ + x(btree_node_read_error, 62) \ + x(btree_node_topology_bad_min_key, 63) \ + x(btree_node_topology_bad_max_key, 64) \ + x(btree_node_topology_overwritten_by_prev_node, 65) \ + x(btree_node_topology_overwritten_by_next_node, 66) \ + x(btree_node_topology_interior_node_empty, 67) \ + x(fs_usage_hidden_wrong, 68) \ + x(fs_usage_btree_wrong, 69) \ + x(fs_usage_data_wrong, 70) \ + x(fs_usage_cached_wrong, 71) \ + x(fs_usage_reserved_wrong, 72) \ + x(fs_usage_persistent_reserved_wrong, 73) \ + x(fs_usage_nr_inodes_wrong, 74) \ + x(fs_usage_replicas_wrong, 75) \ + x(dev_usage_buckets_wrong, 76) \ + x(dev_usage_sectors_wrong, 77) \ + x(dev_usage_fragmented_wrong, 78) \ + x(dev_usage_buckets_ec_wrong, 79) \ + x(bkey_version_in_future, 80) \ + x(bkey_u64s_too_small, 81) \ + x(bkey_invalid_type_for_btree, 82) \ + x(bkey_extent_size_zero, 83) \ + x(bkey_extent_size_greater_than_offset, 84) \ + x(bkey_size_nonzero, 85) \ + x(bkey_snapshot_nonzero, 86) \ + x(bkey_snapshot_zero, 87) \ + x(bkey_at_pos_max, 88) \ + x(bkey_before_start_of_btree_node, 89) \ + x(bkey_after_end_of_btree_node, 90) \ + x(bkey_val_size_nonzero, 91) \ + x(bkey_val_size_too_small, 92) \ + x(alloc_v1_val_size_bad, 93) \ + x(alloc_v2_unpack_error, 94) \ + x(alloc_v3_unpack_error, 95) \ + x(alloc_v4_val_size_bad, 96) \ + x(alloc_v4_backpointers_start_bad, 97) \ + x(alloc_key_data_type_bad, 98) \ + x(alloc_key_empty_but_have_data, 99) \ + x(alloc_key_dirty_sectors_0, 100) \ + x(alloc_key_data_type_inconsistency, 101) \ + x(alloc_key_to_missing_dev_bucket, 102) \ + x(alloc_key_cached_inconsistency, 103) \ + x(alloc_key_cached_but_read_time_zero, 104) \ + x(alloc_key_to_missing_lru_entry, 105) \ + x(alloc_key_data_type_wrong, 106) \ + x(alloc_key_gen_wrong, 107) \ + x(alloc_key_dirty_sectors_wrong, 108) \ + x(alloc_key_cached_sectors_wrong, 109) \ + x(alloc_key_stripe_wrong, 110) \ + x(alloc_key_stripe_redundancy_wrong, 111) \ + x(bucket_sector_count_overflow, 112) \ + x(bucket_metadata_type_mismatch, 113) \ + x(need_discard_key_wrong, 114) \ + x(freespace_key_wrong, 115) \ + x(freespace_hole_missing, 116) \ + x(bucket_gens_val_size_bad, 117) \ + x(bucket_gens_key_wrong, 118) \ + x(bucket_gens_hole_wrong, 119) \ + x(bucket_gens_to_invalid_dev, 120) \ + x(bucket_gens_to_invalid_buckets, 121) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ + x(need_discard_freespace_key_bad, 124) \ + x(backpointer_pos_wrong, 125) \ + x(backpointer_to_missing_device, 126) \ + x(backpointer_to_missing_alloc, 127) \ + x(backpointer_to_missing_ptr, 128) \ + x(lru_entry_at_time_0, 129) \ + x(lru_entry_to_invalid_bucket, 130) \ + x(lru_entry_bad, 131) \ + x(btree_ptr_val_too_big, 132) \ + x(btree_ptr_v2_val_too_big, 133) \ + x(btree_ptr_has_non_ptr, 134) \ + x(extent_ptrs_invalid_entry, 135) \ + x(extent_ptrs_no_ptrs, 136) \ + x(extent_ptrs_too_many_ptrs, 137) \ + x(extent_ptrs_redundant_crc, 138) \ + x(extent_ptrs_redundant_stripe, 139) \ + x(extent_ptrs_unwritten, 140) \ + x(extent_ptrs_written_and_unwritten, 141) \ + x(ptr_to_invalid_device, 142) \ + x(ptr_to_duplicate_device, 143) \ + x(ptr_after_last_bucket, 144) \ + x(ptr_before_first_bucket, 145) \ + x(ptr_spans_multiple_buckets, 146) \ + x(ptr_to_missing_backpointer, 147) \ + x(ptr_to_missing_alloc_key, 148) \ + x(ptr_to_missing_replicas_entry, 149) \ + x(ptr_to_missing_stripe, 150) \ + x(ptr_to_incorrect_stripe, 151) \ + x(ptr_gen_newer_than_bucket_gen, 152) \ + x(ptr_too_stale, 153) \ + x(stale_dirty_ptr, 154) \ + x(ptr_bucket_data_type_mismatch, 155) \ + x(ptr_cached_and_erasure_coded, 156) \ + x(ptr_crc_uncompressed_size_too_small, 157) \ + x(ptr_crc_csum_type_unknown, 158) \ + x(ptr_crc_compression_type_unknown, 159) \ + x(ptr_crc_redundant, 160) \ + x(ptr_crc_uncompressed_size_too_big, 161) \ + x(ptr_crc_nonce_mismatch, 162) \ + x(ptr_stripe_redundant, 163) \ + x(reservation_key_nr_replicas_invalid, 164) \ + x(reflink_v_refcount_wrong, 165) \ + x(reflink_p_to_missing_reflink_v, 166) \ + x(stripe_pos_bad, 167) \ + x(stripe_val_size_bad, 168) \ + x(stripe_sector_count_wrong, 169) \ + x(snapshot_tree_pos_bad, 170) \ + x(snapshot_tree_to_missing_snapshot, 171) \ + x(snapshot_tree_to_missing_subvol, 172) \ + x(snapshot_tree_to_wrong_subvol, 173) \ + x(snapshot_tree_to_snapshot_subvol, 174) \ + x(snapshot_pos_bad, 175) \ + x(snapshot_parent_bad, 176) \ + x(snapshot_children_not_normalized, 177) \ + x(snapshot_child_duplicate, 178) \ + x(snapshot_child_bad, 179) \ + x(snapshot_skiplist_not_normalized, 180) \ + x(snapshot_skiplist_bad, 181) \ + x(snapshot_should_not_have_subvol, 182) \ + x(snapshot_to_bad_snapshot_tree, 183) \ + x(snapshot_bad_depth, 184) \ + x(snapshot_bad_skiplist, 185) \ + x(subvol_pos_bad, 186) \ + x(subvol_not_master_and_not_snapshot, 187) \ + x(subvol_to_missing_root, 188) \ + x(subvol_root_wrong_bi_subvol, 189) \ + x(bkey_in_missing_snapshot, 190) \ + x(inode_pos_inode_nonzero, 191) \ + x(inode_pos_blockdev_range, 192) \ + x(inode_unpack_error, 193) \ + x(inode_str_hash_invalid, 194) \ + x(inode_v3_fields_start_bad, 195) \ + x(inode_snapshot_mismatch, 196) \ + x(inode_unlinked_but_clean, 197) \ + x(inode_unlinked_but_nlink_nonzero, 198) \ + x(inode_checksum_type_invalid, 199) \ + x(inode_compression_type_invalid, 200) \ + x(inode_subvol_root_but_not_dir, 201) \ + x(inode_i_size_dirty_but_clean, 202) \ + x(inode_i_sectors_dirty_but_clean, 203) \ + x(inode_i_sectors_wrong, 204) \ + x(inode_dir_wrong_nlink, 205) \ + x(inode_dir_multiple_links, 206) \ + x(inode_multiple_links_but_nlink_0, 207) \ + x(inode_wrong_backpointer, 208) \ + x(inode_wrong_nlink, 209) \ + x(inode_unreachable, 210) \ + x(deleted_inode_but_clean, 211) \ + x(deleted_inode_missing, 212) \ + x(deleted_inode_is_dir, 213) \ + x(deleted_inode_not_unlinked, 214) \ + x(extent_overlapping, 215) \ + x(extent_in_missing_inode, 216) \ + x(extent_in_non_reg_inode, 217) \ + x(extent_past_end_of_inode, 218) \ + x(dirent_empty_name, 219) \ + x(dirent_val_too_big, 220) \ + x(dirent_name_too_long, 221) \ + x(dirent_name_embedded_nul, 222) \ + x(dirent_name_dot_or_dotdot, 223) \ + x(dirent_name_has_slash, 224) \ + x(dirent_d_type_wrong, 225) \ + x(dirent_d_parent_subvol_wrong, 226) \ + x(dirent_in_missing_dir_inode, 227) \ + x(dirent_in_non_dir_inode, 228) \ + x(dirent_to_missing_inode, 229) \ + x(dirent_to_missing_subvol, 230) \ + x(dirent_to_itself, 231) \ + x(quota_type_invalid, 232) \ + x(xattr_val_size_too_small, 233) \ + x(xattr_val_size_too_big, 234) \ + x(xattr_invalid_type, 235) \ + x(xattr_name_invalid_chars, 236) \ + x(xattr_in_missing_inode, 237) \ + x(root_subvol_missing, 238) \ + x(root_dir_missing, 239) \ + x(root_inode_not_dir, 240) \ + x(dir_loop, 241) \ + x(hash_table_key_duplicate, 242) \ + x(hash_table_key_wrong_offset, 243) + +enum bch_sb_error_id { +#define x(t, n) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + struct bch_sb_error_entry_cpu { u64 id:16, nr:48; diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index b775cf0fb7cb..97790445e67a 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, this_cpu_sub(*lock->readers, !ret); preempt_enable(); - if (!ret && (old & SIX_LOCK_WAITING_write)) - ret = -1 - SIX_LOCK_write; + if (!ret) { + smp_mb(); + if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) + ret = -1 - SIX_LOCK_write; + } } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic_add(SIX_LOCK_HELD_write, &lock->state); diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index e9af77b384c7..5dac038f0851 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -959,7 +959,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) parent_id, id)) goto err; - parent->v.children[i] = le32_to_cpu(child_id); + parent->v.children[i] = cpu_to_le32(child_id); normalize_snapshot_child_pointers(&parent->v); } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index fccd25aa3242..22b34a8e4d6e 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -146,6 +146,24 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); } +int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) +{ + struct bch_subvolume s; + int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); + if (ret) + return ret; + + if (BCH_SUBVOLUME_RO(&s)) + return -EROFS; + return 0; +} + +int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_subvol_is_ro_trans(trans, subvol)); +} + int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, struct bch_subvolume *subvol) { diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a1003d30ab0a..a6f56f66e27c 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -23,6 +23,9 @@ int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); +int bch2_subvol_is_ro_trans(struct btree_trans *, u32); +int bch2_subvol_is_ro(struct bch_fs *, u32); + int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 86833445af20..2d2e66a4e468 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,7 @@ struct snapshot_t { }; struct snapshot_table { - struct snapshot_t s[0]; + DECLARE_FLEX_ARRAY(struct snapshot_t, s); }; typedef struct { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f4cad903f4d6..78013deda9df 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -13,6 +13,7 @@ #include "replicas.h" #include "quota.h" #include "sb-clean.h" +#include "sb-downgrade.h" #include "sb-errors.h" #include "sb-members.h" #include "super-io.h" @@ -163,9 +164,10 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb, void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); - if (!IS_ERR_OR_NULL(sb->bdev)) - blkdev_put(sb->bdev, sb->holder); + if (!IS_ERR_OR_NULL(sb->bdev_handle)) + bdev_release(sb->bdev_handle); kfree(sb->holder); + kfree(sb->sb_name); kfree(sb->sb); memset(sb, 0, sizeof(*sb)); @@ -263,6 +265,17 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, return f; } +struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); + + if (!f || le32_to_cpu(f->u64s) < u64s) + f = bch2_sb_field_resize_id(sb, type, u64s); + return f; +} + /* Superblock validate: */ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) @@ -483,6 +496,21 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, /* device open: */ +static unsigned long le_ulong_to_cpu(unsigned long v) +{ + return sizeof(unsigned long) == 8 + ? le64_to_cpu(v) + : le32_to_cpu(v); +} + +static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned nr) +{ + BUG_ON(nr & (BITS_PER_TYPE(long) - 1)); + + for (unsigned i = 0; i < BITS_TO_LONGS(nr); i++) + dst[i] = le_ulong_to_cpu(src[i]); +} + static void bch2_sb_update(struct bch_fs *c) { struct bch_sb *src = c->disk_sb.sb; @@ -511,8 +539,15 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.features = le64_to_cpu(src->features[0]); c->sb.compat = le64_to_cpu(src->compat[0]); + memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); + if (ext) + le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, + sizeof(c->sb.errors_silent) * 8); + for_each_member_device(ca, c, i) { - struct bch_member m = bch2_sb_member_get(src, i); + struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); ca->mi = bch2_mi_to_cpu(&m); } } @@ -675,6 +710,10 @@ retry: if (!sb->holder) return -ENOMEM; + sb->sb_name = kstrdup(path, GFP_KERNEL); + if (!sb->sb_name) + return -ENOMEM; + #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) sb->mode |= BLK_OPEN_BUFFERED; @@ -686,21 +725,22 @@ retry: if (!opt_get(*opts, nochanges)) sb->mode |= BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (IS_ERR(sb->bdev) && - PTR_ERR(sb->bdev) == -EACCES && + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev_handle) && + PTR_ERR(sb->bdev_handle) == -EACCES && opt_get(*opts, read_only)) { sb->mode &= ~BLK_OPEN_WRITE; - sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); - if (!IS_ERR(sb->bdev)) + sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev_handle)) opt_set(*opts, nochanges, true); } - if (IS_ERR(sb->bdev)) { - ret = PTR_ERR(sb->bdev); + if (IS_ERR(sb->bdev_handle)) { + ret = PTR_ERR(sb->bdev_handle); goto out; } + sb->bdev = sb->bdev_handle->bdev; ret = bch2_sb_realloc(sb, 0); if (ret) { @@ -901,6 +941,7 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_members_from_cpu(c); bch2_sb_members_cpy_v2_v1(&c->disk_sb); bch2_sb_errors_from_cpu(c); + bch2_sb_downgrade_update(c); for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -1024,8 +1065,10 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) } /* Downgrade if superblock is at a higher version than currently supported: */ -void bch2_sb_maybe_downgrade(struct bch_fs *c) +bool bch2_check_version_downgrade(struct bch_fs *c) { + bool ret = bcachefs_metadata_version_current < c->sb.version; + lockdep_assert_held(&c->sb_lock); /* @@ -1039,16 +1082,61 @@ void bch2_sb_maybe_downgrade(struct bch_fs *c) if (c->sb.version_min > bcachefs_metadata_version_current) c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); + return ret; } void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) { lockdep_assert_held(&c->sb_lock); + if (BCH_VERSION_MAJOR(new_version) > + BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) + bch2_sb_field_resize(&c->disk_sb, downgrade, 0); + c->disk_sb.sb->version = cpu_to_le16(new_version); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); } +static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + if (vstruct_bytes(f) < 88) { + prt_printf(err, "field too small (%zu < %u)", vstruct_bytes(f), 88); + return -BCH_ERR_invalid_sb_ext; + } + + return 0; +} + +static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_ext *e = field_to_type(f, ext); + + prt_printf(out, "Recovery passes required:"); + prt_tab(out); + prt_bitflags(out, bch2_recovery_passes, + bch2_recovery_passes_from_stable(le64_to_cpu(e->recovery_passes_required[0]))); + prt_newline(out); + + unsigned long *errors_silent = kmalloc(sizeof(e->errors_silent), GFP_KERNEL); + if (errors_silent) { + le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); + + prt_printf(out, "Errors to silently fix:"); + prt_tab(out); + prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); + prt_newline(out); + + kfree(errors_silent); + } +} + +static const struct bch_sb_field_ops bch_sb_field_ops_ext = { + .validate = bch2_sb_ext_validate, + .to_text = bch2_sb_ext_to_text, +}; + static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #define x(f, nr) \ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index f5abd102bff7..e41e5de531a0 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -40,6 +40,16 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *, #define bch2_sb_field_resize(_sb, _name, _u64s) \ field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) +struct bch_sb_field *bch2_sb_field_get_minsize_id(struct bch_sb_handle *, + enum bch_sb_field_type, unsigned); +#define bch2_sb_field_get_minsize(_sb, _name, _u64s) \ + field_to_type(bch2_sb_field_get_minsize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) + +#define bch2_sb_field_nr_entries(_f) \ + (_f ? ((bch2_sb_field_bytes(&_f->field) - sizeof(*_f)) / \ + sizeof(_f->entries[0])) \ + : 0) + void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); extern const char * const bch2_sb_fields[]; @@ -83,7 +93,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -void bch2_sb_maybe_downgrade(struct bch_fs *); +bool bch2_check_version_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 24672bb31cbe..818ec467a06b 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -72,6 +72,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); +MODULE_SOFTDEP("pre: crc32c"); +MODULE_SOFTDEP("pre: crc64"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: chacha20"); +MODULE_SOFTDEP("pre: poly1305"); +MODULE_SOFTDEP("pre: xxhash"); #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ @@ -423,6 +429,18 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + set_bit(BCH_FS_RW, &c->flags); + set_bit(BCH_FS_WAS_RW, &c->flags); + +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_reinit(&c->writes); +#else + for (i = 0; i < BCH_WRITE_REF_NR; i++) { + BUG_ON(atomic_long_read(&c->writes[i])); + atomic_long_inc(&c->writes[i]); + } +#endif + ret = bch2_gc_thread_start(c); if (ret) { bch_err(c, "error starting gc thread"); @@ -439,24 +457,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) goto err; } -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_reinit(&c->writes); -#else - for (i = 0; i < BCH_WRITE_REF_NR; i++) { - BUG_ON(atomic_long_read(&c->writes[i])); - atomic_long_inc(&c->writes[i]); - } -#endif - set_bit(BCH_FS_RW, &c->flags); - set_bit(BCH_FS_WAS_RW, &c->flags); - bch2_do_discards(c); bch2_do_invalidates(c); bch2_do_stripe_deletes(c); bch2_do_pending_node_rewrites(c); return 0; err: - __bch2_fs_read_only(c); + if (test_bit(BCH_FS_RW, &c->flags)) + bch2_fs_read_only(c); + else + __bch2_fs_read_only(c); return ret; } @@ -504,8 +514,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(c); + bch2_journal_keys_put_initial(c); + BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); @@ -702,12 +712,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); + atomic_set(&c->journal_keys.ref, 1); + c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 7dda4985b99f..b2119686e2e1 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -4,7 +4,9 @@ struct bch_sb_handle { struct bch_sb *sb; + struct bdev_handle *bdev_handle; struct block_device *bdev; + char *sb_name; struct bio *bio; void *holder; size_t buffer_size; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ab743115f169..f3cb7115b530 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -276,8 +276,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (!btree_type_has_ptrs(id)) continue; - for_each_btree_key(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = for_each_btree_key2(trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -309,8 +309,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c nr_uncompressed_extents++; else if (compressed) nr_compressed_extents++; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } bch2_trans_put(trans); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 893304a1f06e..fd49b63562c3 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write, TRACE_EVENT(journal_reclaim_start, TP_PROTO(struct bch_fs *c, bool direct, bool kicked, u64 min_nr, u64 min_key_cache, - u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), @@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start, __field(bool, kicked ) __field(u64, min_nr ) __field(u64, min_key_cache ) - __field(u64, prereserved ) - __field(u64, prereserved_total ) __field(u64, btree_cache_dirty ) __field(u64, btree_cache_total ) __field(u64, btree_key_cache_dirty ) @@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start, __entry->kicked = kicked; __entry->min_nr = min_nr; __entry->min_key_cache = min_key_cache; - __entry->prereserved = prereserved; - __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_total = btree_cache_total; __entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->direct, __entry->kicked, __entry->min_nr, __entry->min_key_cache, - __entry->prereserved, - __entry->prereserved_total, __entry->btree_cache_dirty, __entry->btree_cache_total, __entry->btree_key_cache_dirty, @@ -761,9 +754,9 @@ TRACE_EVENT(move_extent_fail, TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) ); -DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(bkey, move_extent_start_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(move_data, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 2984b57b2958..b701f7fe0784 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -243,6 +243,7 @@ do { \ #define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) #define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) #define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) +#define prt_bitflags_vector(...) bch2_prt_bitflags_vector(__VA_ARGS__) void bch2_pr_time_units(struct printbuf *, u64); void bch2_prt_datetime(struct printbuf *, time64_t); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index a39ff0c296ec..5a1858fb9879 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -176,7 +176,8 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, struct btree_iter inode_iter = { NULL }; int ret; - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: + bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) return ret; @@ -552,6 +553,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, s.v = v + 1; s.defined = true; } else { + /* + * Check if this option was set on the parent - if so, switched + * back to inheriting from the parent: + * + * rename() also has to deal with keeping inherited options up + * to date - see bch2_reinherit_attrs() + */ + spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { struct bch_inode_info *dir = to_bch_ei(d_inode(dentry->d_parent)); @@ -560,6 +569,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } else { s.v = 0; } + spin_unlock(&dentry->d_lock); s.defined = false; } |