diff options
Diffstat (limited to 'fs')
246 files changed, 8029 insertions, 4967 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index d525957594b6..61dbe52bb3a3 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -732,4 +732,5 @@ module_exit(exit_v9fs) MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); +MODULE_DESCRIPTION("9P Client File System"); MODULE_LICENSE("GPL"); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index cdf441f22e07..731e3d14b67d 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -52,7 +52,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 053d1cef6e13..8604e3377ee7 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -68,7 +68,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, struct p9_fid *fid; int ret; - p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n", + p9_debug(P9_DEBUG_VFS, "name = '%s' value_len = %zu\n", name, buffer_size); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) @@ -139,7 +139,8 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); + /* Txattrwalk with an empty string lists xattrs instead */ + return v9fs_xattr_get(dentry, "", buffer, buffer_size); } static int v9fs_xattr_handler_get(const struct xattr_handler *handler, diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 2fe4a5832fcf..d6b9758ee23d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -568,6 +568,7 @@ static struct dentry *affs_fh_to_parent(struct super_block *sb, struct fid *fid, } const struct export_operations affs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = affs_fh_to_dentry, .fh_to_parent = affs_fh_to_parent, .get_parent = affs_get_parent, diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index df13a4f9a6e3..c08c2c7d6fbb 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -24,7 +24,6 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME - select MEAN_AND_VARIANCE help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. @@ -42,7 +41,6 @@ config BCACHEFS_POSIX_ACL config BCACHEFS_DEBUG_TRANSACTIONS bool "bcachefs runtime info" depends on BCACHEFS_FS - default y help This makes the list of running btree transactions available in debugfs. @@ -78,7 +76,7 @@ config BCACHEFS_NO_LATENCY_ACCT config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT - select MEAN_AND_VARIANCE + depends on BCACHEFS_FS default KUNIT_ALL_TESTS help This option enables the kunit tests for mean_and_variance module. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0749731b9072..45b64f89258c 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -70,6 +70,7 @@ bcachefs-y := \ reflink.o \ replicas.o \ sb-clean.o \ + sb-errors.o \ sb-members.o \ siphash.o \ six.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 2d516207e223..1fec0e67891f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -192,123 +192,109 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + int ret = 0; /* allow for unknown fields */ - if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { - prt_printf(err, "incorrect value size (%zu < %u)", - bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, + alloc_v1_val_size_bad, + "incorrect value size (%zu < %u)", + bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); +fsck_err: + return ret; } -int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; + int ret = 0; - if (bch2_alloc_unpack_v2(&u, k)) { - prt_printf(err, "unpack error"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, + alloc_v2_unpack_error, + "unpack error"); +fsck_err: + return ret; } -int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_alloc_unpacked u; + int ret = 0; - if (bch2_alloc_unpack_v3(&u, k)) { - prt_printf(err, "unpack error"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, + alloc_v2_unpack_error, + "unpack error"); +fsck_err: + return ret; } -int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + int ret = 0; - if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { - prt_printf(err, "bad val size (%u > %zu)", - alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err, + alloc_v4_val_size_bad, + "bad val size (%u > %zu)", + alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); - if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { - prt_printf(err, "invalid backpointers_start"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, + alloc_v4_backpointers_start_bad, + "invalid backpointers_start"); - if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { - prt_printf(err, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, + alloc_key_data_type_bad, + "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - if (a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe) { - prt_printf(err, "empty data type free but have data"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe, c, err, + alloc_key_empty_but_have_data, + "empty data type free but have data"); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - if (!a.v->dirty_sectors) { - prt_printf(err, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(!a.v->dirty_sectors, c, err, + alloc_key_dirty_sectors_0, + "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); break; case BCH_DATA_cached: - if (!a.v->cached_sectors || - a.v->dirty_sectors || - a.v->stripe) { - prt_printf(err, "data type inconsistency"); - return -BCH_ERR_invalid_bkey; - } - - if (!a.v->io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { - prt_printf(err, "cached bucket with read_time == 0"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe, c, err, + alloc_key_cached_inconsistency, + "data type inconsistency"); + + bkey_fsck_err_on(!a.v->io_time[READ] && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + c, err, + alloc_key_cached_but_read_time_zero, + "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: break; } - - return 0; -} - -static inline u64 swab40(u64 x) -{ - return (((x & 0x00000000ffULL) << 32)| - ((x & 0x000000ff00ULL) << 16)| - ((x & 0x0000ff0000ULL) >> 0)| - ((x & 0x00ff000000ULL) >> 16)| - ((x & 0xff00000000ULL) >> 32)); +fsck_err: + return ret; } void bch2_alloc_v4_swab(struct bkey_s k) @@ -324,6 +310,7 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->io_time[1] = swab64(a->io_time[1]); a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); + a->fragmentation_lru = swab64(a->fragmentation_lru); bps = alloc_v4_backpointers(a); for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { @@ -521,17 +508,18 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) : 0; } -int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { - prt_printf(err, "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, + bucket_gens_val_size_bad, + "bad val size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); +fsck_err: + return ret; } void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) @@ -727,7 +715,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", set ? "setting" : "clearing", - bch2_btree_ids[btree], + bch2_btree_id_str(btree), iter.pos.inode, iter.pos.offset, bch2_bkey_types[old.k->type], @@ -986,6 +974,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, int ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, + alloc_key_to_missing_dev_bucket, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) return bch2_btree_delete_at(trans, alloc_iter, 0); @@ -1005,7 +994,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (k.k->type != discard_key_type && (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" + fsck_err(c, need_discard_key_wrong, + "incorrect key in need_discard btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[discard_key_type], @@ -1035,7 +1025,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (k.k->type != freespace_key_type && (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" + fsck_err(c, freespace_key_wrong, + "incorrect key in freespace btree (got %s should be %s)\n" " %s", bch2_bkey_types[k.k->type], bch2_bkey_types[freespace_key_type], @@ -1066,7 +1057,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (a->gen != alloc_gen(k, gens_offset) && (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n" + fsck_err(c, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" " %s", alloc_gen(k, gens_offset), a->gen, (printbuf_reset(&buf), @@ -1124,7 +1116,8 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, if (k.k->type != KEY_TYPE_set && (c->opts.reconstruct_alloc || - fsck_err(c, "hole in alloc btree missing in freespace btree\n" + fsck_err(c, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" " device %llu buckets %llu-%llu", freespace_iter->pos.inode, freespace_iter->pos.offset, @@ -1187,6 +1180,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, for (i = gens_offset; i < gens_end_offset; i++) { if (fsck_err_on(g.v.gens[i], c, + bucket_gens_hole_wrong, "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", bucket_gens_pos_to_alloc(k.k->p, i).inode, bucket_gens_pos_to_alloc(k.k->p, i).offset, @@ -1244,8 +1238,9 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + need_discard_freespace_key_to_invalid_dev_bucket, "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) + bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) goto delete; a = bch2_alloc_to_v4(alloc_k, &a_convert); @@ -1253,9 +1248,10 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a)), c, + need_discard_freespace_key_bad, "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_ids[iter->btree_id], + bch2_btree_id_str(iter->btree_id), iter->pos.inode, iter->pos.offset, a->data_type == state, @@ -1320,6 +1316,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, dev_exists = bch2_dev_exists2(c, k.k->p.inode); if (!dev_exists) { if (fsck_err_on(!dev_exists, c, + bucket_gens_to_invalid_dev, "bucket_gens key for invalid device:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -1330,6 +1327,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, ca = bch_dev_bkey_exists(c, k.k->p.inode); if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, c, + bucket_gens_to_invalid_buckets, "bucket_gens key for invalid buckets:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -1338,6 +1336,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, for (b = start; b < ca->mi.first_bucket; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; @@ -1345,6 +1344,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, for (b = ca->mi.nbuckets; b < end; b++) if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + bucket_gens_nonzero_for_invalid_buckets, "bucket_gens key has nonzero gen for invalid bucket")) { g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; need_update = true; @@ -1495,11 +1495,13 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, return ret; if (fsck_err_on(!a->io_time[READ], c, + alloc_key_cached_but_read_time_zero, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + alloc_key_to_missing_lru_entry, "missing lru entry\n" " %s", (printbuf_reset(&buf), @@ -2075,6 +2077,17 @@ void bch2_recalc_capacity(struct bch_fs *c) closure_wake_up(&c->freelist_wait); } +u64 bch2_min_rw_member_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + u64 ret = U64_MAX; + + for_each_rw_member(ca, c, i) + ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + return ret; +} + static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 97042067d2a9..73faf99a222a 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -149,13 +149,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -193,7 +193,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -249,6 +249,7 @@ int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); void bch2_recalc_capacity(struct bch_fs *); +u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 3bc4abd3d7d5..b85c7765272f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { - struct btree_iter iter; - struct bkey_s_c k; + struct btree_iter iter, citer; + struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); - u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); + u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; int ret; + + /* + * Scan with an uncached iterator to avoid polluting the key cache. An + * uncached iter will return a cached key if one exists, but if not + * there is no other underlying protection for the associated key cache + * slot. To avoid racing bucket allocations, look up the cached key slot + * of any likely allocation candidate before attempting to proceed with + * the allocation. This provides proper exclusion on the associated + * bucket. + */ again: for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { @@ -419,25 +430,38 @@ again: continue; a = bch2_alloc_to_v4(k, &a_convert); - if (a->data_type != BCH_DATA_free) continue; + /* now check the cached key to serialize concurrent allocs of the bucket */ + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ret = bkey_err(ck); + if (ret) + break; + + a = bch2_alloc_to_v4(ck, &a_convert); + if (a->data_type != BCH_DATA_free) + goto next; + s->buckets_seen++; ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); +next: + citer.path->preserve = false; + bch2_trans_iter_exit(trans, &citer); if (ob) break; } bch2_trans_iter_exit(trans, &iter); + alloc_cursor = iter.pos.offset; ca->alloc_cursor = alloc_cursor; if (!ob && ret) ob = ERR_PTR(ret); - if (!ob && alloc_cursor > alloc_start) { - alloc_cursor = alloc_start; + if (!ob && alloc_start > first_bucket) { + alloc_cursor = alloc_start = first_bucket; goto again; } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index cc856150a948..ef02c9bb0354 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -5,6 +5,7 @@ #include "backpointers.h" #include "btree_cache.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "btree_write_buffer.h" #include "error.h" @@ -37,25 +38,26 @@ static bool extent_matches_bp(struct bch_fs *c, return false; } -int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + int ret = 0; - if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { - prt_str(err, "backpointer at wrong pos"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + c, err, + backpointer_pos_wrong, + "backpointer at wrong pos"); +fsck_err: + return ret; } void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) { prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", - bch2_btree_ids[bp->btree_id], + bch2_btree_id_str(bp->btree_id), bp->level, (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), @@ -76,7 +78,7 @@ void bch2_backpointer_swab(struct bkey_s k) { struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - bp.v->bucket_offset = swab32(bp.v->bucket_offset); + bp.v->bucket_offset = swab40(bp.v->bucket_offset); bp.v->bucket_len = swab32(bp.v->bucket_len); bch2_bpos_swab(&bp.v->pos); } @@ -219,18 +221,22 @@ out: static void backpointer_not_found(struct btree_trans *trans, struct bpos bp_pos, struct bch_backpointer bp, - struct bkey_s_c k, - const char *thing_it_points_to) + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + /* + * If we're using the btree write buffer, the backpointer we were + * looking at may have already been deleted - failure to find what it + * pointed to is not an error: + */ if (likely(!bch2_backpointers_no_use_write_buffer)) return; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", - thing_it_points_to); + bp.level ? "btree node" : "extent"); prt_printf(&buf, "bucket: "); bch2_bpos_to_text(&buf, bucket); prt_printf(&buf, "\n "); @@ -256,56 +262,37 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct bch_backpointer bp, unsigned iter_flags) { - struct bch_fs *c = trans->c; - struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); - struct bpos bucket = bp_pos_to_bucket(c, bp_pos); - struct bkey_s_c k; - - bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, - 0, - min(bp.level, r->level), - iter_flags); - k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (bp.level == r->level + 1) - k = bkey_i_to_s_c(&r->key); - - if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) - return k; - - bch2_trans_iter_exit(trans, iter); + if (likely(!bp.level)) { + struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + struct bkey_s_c k; + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, 0, + iter_flags); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } - if (unlikely(bch2_backpointers_no_use_write_buffer)) { - if (bp.level) { - struct btree *b; + if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; - /* - * If a backpointer for a btree node wasn't found, it may be - * because it was overwritten by a new btree node that hasn't - * been written out yet - backpointer_get_node() checks for - * this: - */ - b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); - if (!IS_ERR_OR_NULL(b)) - return bkey_i_to_s_c(&b->key); + bch2_trans_iter_exit(trans, iter); + backpointer_not_found(trans, bp_pos, bp, k); + return bkey_s_c_null; + } else { + struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + if (IS_ERR_OR_NULL(b)) { bch2_trans_iter_exit(trans, iter); - - if (IS_ERR(b)) - return bkey_s_c_err(PTR_ERR(b)); - return bkey_s_c_null; + return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; } - - backpointer_not_found(trans, bp_pos, bp, k, "extent"); + return bkey_i_to_s_c(&b->key); } - - return bkey_s_c_null; } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, @@ -329,6 +316,8 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (IS_ERR(b)) goto err; + BUG_ON(b->c.level != bp.level - 1); + if (b && extent_matches_bp(c, bp.btree_id, bp.level, bkey_i_to_s_c(&b->key), bucket, bp)) @@ -337,8 +326,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (b && btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bp_pos, bp, - bkey_i_to_s_c(&b->key), "btree node"); + backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); b = NULL; } err: @@ -356,6 +344,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ int ret = 0; if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); @@ -369,6 +358,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ goto out; if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, + backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { @@ -453,14 +443,14 @@ fsck_err: return ret; missing: prt_printf(&buf, "missing backpointer for btree=%s l=%u ", - bch2_btree_ids[bp.btree_id], bp.level); + bch2_btree_id_str(bp.btree_id), bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || c->opts.reconstruct_alloc || - fsck_err(c, "%s", buf.buf)) + fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; @@ -793,7 +783,9 @@ static int check_one_backpointer(struct btree_trans *trans, } if (fsck_err_on(!k.k, c, - "backpointer for missing extent\n %s", + backpointer_to_missing_ptr, + "backpointer for missing %s\n %s", + bp.v->level ? "btree node" : "extent", (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); goto out; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 547e0617602a..ab866feeaf66 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -7,7 +7,16 @@ #include "buckets.h" #include "super.h" -int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, +static inline u64 swab40(u64 x) +{ + return (((x & 0x00000000ffULL) << 32)| + ((x & 0x000000ff00ULL) << 16)| + ((x & 0x0000ff0000ULL) >> 0)| + ((x & 0x00ff000000ULL) >> 16)| + ((x & 0xff00000000ULL) >> 32)); +} + +int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, enum bkey_invalid_flags, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h index 1fbed1f8378d..be2edced5213 100644 --- a/fs/bcachefs/bbpos.h +++ b/fs/bcachefs/bbpos.h @@ -2,20 +2,9 @@ #ifndef _BCACHEFS_BBPOS_H #define _BCACHEFS_BBPOS_H +#include "bbpos_types.h" #include "bkey_methods.h" - -struct bbpos { - enum btree_id btree; - struct bpos pos; -}; - -static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) -{ - return (struct bbpos) { btree, pos }; -} - -#define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#include "btree_cache.h" static inline int bbpos_cmp(struct bbpos l, struct bbpos r) { @@ -40,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) { - prt_str(out, bch2_btree_ids[pos.btree]); + prt_str(out, bch2_btree_id_str(pos.btree)); prt_char(out, ':'); bch2_bpos_to_text(out, pos.pos); } diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h new file mode 100644 index 000000000000..5198e94cf3b8 --- /dev/null +++ b/fs/bcachefs/bbpos_types.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_TYPES_H +#define _BCACHEFS_BBPOS_TYPES_H + +struct bbpos { + enum btree_id btree; + struct bpos pos; +}; + +static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) +{ + return (struct bbpos) { btree, pos }; +} + +#define BBPOS_MIN BBPOS(0, POS_MIN) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) + +#endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 53ffa88cae16..9cb8684959ee 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -209,6 +209,7 @@ #include "nocow_locking_types.h" #include "opts.h" #include "recovery_types.h" +#include "sb-errors_types.h" #include "seqmutex.h" #include "util.h" @@ -418,6 +419,7 @@ enum bch_time_stats { #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" #include "clock_types.h" +#include "disk_groups_types.h" #include "ec_types.h" #include "journal_types.h" #include "keylist_types.h" @@ -463,6 +465,7 @@ enum gc_phase { GC_PHASE_BTREE_snapshot_trees, GC_PHASE_BTREE_deleted_inodes, GC_PHASE_BTREE_logged_ops, + GC_PHASE_BTREE_rebalance_work, GC_PHASE_PENDING_DELETE, }; @@ -500,6 +503,8 @@ struct bch_dev { * Committed by bch2_write_super() -> bch_fs_mi_update() */ struct bch_member_cpu mi; + atomic64_t errors[BCH_MEMBER_ERROR_NR]; + __uuid_t uuid; char name[BDEVNAME_SIZE]; @@ -578,7 +583,7 @@ enum { BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, - BCH_FS_HAVE_DELETED_SNAPSHOTS, + BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, /* errors: */ BCH_FS_ERROR, @@ -938,9 +943,6 @@ struct bch_fs { struct list_head moving_context_list; struct mutex moving_context_lock; - struct list_head data_progress_list; - struct mutex data_progress_lock; - /* REBALANCE */ struct bch_fs_rebalance rebalance; @@ -991,11 +993,6 @@ struct bch_fs { struct bio_set dio_read_bioset; struct bio_set nocow_flush_bioset; - /* ERRORS */ - struct list_head fsck_errors; - struct mutex fsck_error_lock; - bool fsck_alloc_err; - /* QUOTAS */ struct bch_memquota_type quotas[QTYP_NR]; @@ -1044,6 +1041,14 @@ struct bch_fs { struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; + + /* ERRORS */ + struct list_head fsck_error_msgs; + struct mutex fsck_error_msgs_lock; + bool fsck_alloc_msgs_err; + + bch_sb_errors_cpu fsck_error_counts; + struct mutex fsck_error_counts_lock; }; extern struct wait_queue_head bch2_read_only_wait; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 99749f3315fe..0a750953ff92 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -613,31 +613,17 @@ struct bch_extent_stripe_ptr { #endif }; -struct bch_extent_reservation { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:22, - replicas:4, - generation:32; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 generation:32, - replicas:4, - unused:22, - type:6; -#endif -}; - struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:7, - unused:33, - compression:8, + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ target:16; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 target:16, compression:8, - unused:33, - type:7; + unused:34, + type:6; #endif }; @@ -838,34 +824,30 @@ enum inode_opt_id { Inode_opt_nr, }; -enum { - /* - * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL - * flags) - */ - __BCH_INODE_SYNC = 0, - __BCH_INODE_IMMUTABLE = 1, - __BCH_INODE_APPEND = 2, - __BCH_INODE_NODUMP = 3, - __BCH_INODE_NOATIME = 4, - - __BCH_INODE_I_SIZE_DIRTY = 5, /* obsolete */ - __BCH_INODE_I_SECTORS_DIRTY = 6, /* obsolete */ - __BCH_INODE_UNLINKED = 7, - __BCH_INODE_BACKPTR_UNTRUSTED = 8, - - /* bits 20+ reserved for packed fields below: */ -}; - -#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); @@ -1232,7 +1214,8 @@ struct bch_sb_field { x(journal_seq_blacklist, 8) \ x(journal_v2, 9) \ x(counters, 10) \ - x(members_v2, 11) + x(members_v2, 11) \ + x(errors, 12) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1282,6 +1265,18 @@ enum bch_iops_measurement { BCH_IOPS_NR }; +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + struct bch_member { __uuid_t uuid; __le64 nbuckets; /* device size */ @@ -1292,6 +1287,9 @@ struct bch_member { __le64 flags; __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; }; #define BCH_MEMBER_V1_BYTES 56 @@ -1615,11 +1613,20 @@ struct journal_seq_blacklist_entry { struct bch_sb_field_journal_seq_blacklist { struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; - struct journal_seq_blacklist_entry start[0]; - __u64 _data[]; +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; }; +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + /* Superblock: */ /* @@ -1682,7 +1689,9 @@ struct bch_sb_field_journal_seq_blacklist { x(snapshot_skiplists, BCH_VERSION(1, 1), \ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ x(deleted_inodes, BCH_VERSION(1, 2), \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) + BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ + x(rebalance_work, BCH_VERSION(1, 3), \ + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1693,7 +1702,7 @@ enum bcachefs_metadata_version { }; static const __maybe_unused -unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; +unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -2247,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); enum btree_id_flags { BTREE_ID_EXTENTS = BIT(0), BTREE_ID_SNAPSHOTS = BIT(1), - BTREE_ID_DATA = BIT(2), + BTREE_ID_SNAPSHOT_FIELD = BIT(2), + BTREE_ID_DATA = BIT(3), }; #define BCH_BTREE_IDS() \ @@ -2302,11 +2312,13 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_bucket_gens)) \ x(snapshot_trees, 15, 0, \ BIT_ULL(KEY_TYPE_snapshot_tree)) \ - x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ + x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)) + BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 518450209236..831be01809f2 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -92,19 +92,15 @@ enum bkey_lr_packed { #define bkey_lr_packed(_l, _r) \ ((_l)->format + ((_r)->format << 1)) -#define bkey_copy(_dst, _src) \ -do { \ - BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ - !type_is(_dst, struct bkey_packed *)); \ - BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ - !type_is(_src, struct bkey_packed *)); \ - EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ - (u64 *) (_dst) < (u64 *) (_src) + \ - ((struct bkey *) (_src))->u64s); \ - \ - memcpy_u64s_small((_dst), (_src), \ - ((struct bkey *) (_src))->u64s); \ -} while (0) +static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) +{ + memcpy_u64s_small(dst, src, src->u64s); +} + +static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) +{ + memcpy_u64s_small(dst, src, src->k.u64s); +} struct btree; diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index d9fb1fc81f1e..761f5e33b1e6 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "backpointers.h" #include "bkey_methods.h" +#include "btree_cache.h" #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" @@ -25,7 +26,7 @@ const char * const bch2_bkey_types[] = { NULL }; -static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, +static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { return 0; @@ -39,23 +40,24 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, .key_invalid = deleted_key_invalid, \ }) -static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, +static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_bytes(k.k)) { - prt_printf(err, "incorrect value size (%zu != 0)", - bkey_val_bytes(k.k)); - return -BCH_ERR_invalid_bkey; - } - - return 0; + int ret = 0; + + bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, + bkey_val_size_nonzero, + "incorrect value size (%zu != 0)", + bkey_val_bytes(k.k)); +fsck_err: + return ret; } #define bch2_bkey_ops_error ((struct bkey_ops) { \ .key_invalid = empty_val_key_invalid, \ }) -static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, +static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { return 0; @@ -70,7 +72,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, .key_invalid = empty_val_key_invalid, \ }) -static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, +static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { return 0; @@ -91,18 +93,6 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, .val_to_text = key_type_inline_data_to_text, \ }) -static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - enum bkey_invalid_flags flags, struct printbuf *err) -{ - if (bkey_val_bytes(k.k)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_cookie)); - return -BCH_ERR_invalid_bkey; - } - - return 0; -} - static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { bch2_key_resize(l.k, l.k->size + r.k->size); @@ -110,7 +100,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ } #define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_invalid = key_type_set_invalid, \ + .key_invalid = empty_val_key_invalid, \ .key_merge = key_type_set_merge, \ }) @@ -128,84 +118,95 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + int ret = 0; - if (bkey_val_bytes(k.k) < ops->min_val_size) { - prt_printf(err, "bad val size (%zu < %u)", - bkey_val_bytes(k.k), ops->min_val_size); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, + bkey_val_size_too_small, + "bad val size (%zu < %u)", + bkey_val_bytes(k.k), ops->min_val_size); if (!ops->key_invalid) return 0; - return ops->key_invalid(c, k, flags, err); + ret = ops->key_invalid(c, k, flags, err); +fsck_err: + return ret; } static u64 bch2_key_types_allowed[] = { -#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, - BCH_BTREE_IDS() -#undef x [BKEY_TYPE_btree] = BIT_ULL(KEY_TYPE_deleted)| BIT_ULL(KEY_TYPE_btree_ptr)| BIT_ULL(KEY_TYPE_btree_ptr_v2), +#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, + BCH_BTREE_IDS() +#undef x }; +const char *bch2_btree_node_type_str(enum btree_node_type type) +{ + return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); +} + int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, enum bkey_invalid_flags flags, struct printbuf *err) { - if (k.k->u64s < BKEY_U64s) { - prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - if (flags & BKEY_INVALID_COMMIT && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { - prt_printf(err, "invalid key type for btree %s (%s)", - bch2_btree_ids[type], bch2_bkey_types[k.k->type]); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, + bkey_u64s_too_small, + "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - if (k.k->size == 0) { - prt_printf(err, "size == 0"); - return -BCH_ERR_invalid_bkey; - } + if (type >= BKEY_TYPE_NR) + return 0; - if (k.k->size > k.k->p.offset) { - prt_printf(err, "size greater than offset (%u > %llu)", - k.k->size, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, + bkey_invalid_type_for_btree, + "invalid key type for btree %s (%s)", + bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + bkey_fsck_err_on(k.k->size == 0, c, err, + bkey_extent_size_zero, + "size == 0"); + + bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, + bkey_extent_size_greater_than_offset, + "size greater than offset (%u > %llu)", + k.k->size, k.k->p.offset); } else { - if (k.k->size) { - prt_printf(err, "size != 0"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->size, c, err, + bkey_size_nonzero, + "size != 0"); } if (type != BKEY_TYPE_btree) { - if (!btree_type_has_snapshots((enum btree_id) type) && - k.k->p.snapshot) { - prt_printf(err, "nonzero snapshot"); - return -BCH_ERR_invalid_bkey; - } - - if (btree_type_has_snapshots((enum btree_id) type) && - !k.k->p.snapshot) { - prt_printf(err, "snapshot == 0"); - return -BCH_ERR_invalid_bkey; + enum btree_id btree = type - 1; + + if (btree_type_has_snapshots(btree)) { + bkey_fsck_err_on(!k.k->p.snapshot, c, err, + bkey_snapshot_zero, + "snapshot == 0"); + } else if (!btree_type_has_snapshot_field(btree)) { + bkey_fsck_err_on(k.k->p.snapshot, c, err, + bkey_snapshot_nonzero, + "nonzero snapshot"); + } else { + /* + * btree uses snapshot field but it's not required to be + * nonzero + */ } - if (bkey_eq(k.k->p, POS_MAX)) { - prt_printf(err, "key at POS_MAX"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, + bkey_at_pos_max, + "key at POS_MAX"); } - - return 0; +fsck_err: + return ret; } int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -217,20 +218,20 @@ int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, bch2_bkey_val_invalid(c, k, flags, err); } -int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, - struct printbuf *err) +int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, struct printbuf *err) { - if (bpos_lt(k.k->p, b->data->min_key)) { - prt_printf(err, "key before start of btree node"); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - if (bpos_gt(k.k->p, b->data->max_key)) { - prt_printf(err, "key past end of btree node"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, + bkey_before_start_of_btree_node, + "key before start of btree node"); - return 0; + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, + bkey_after_end_of_btree_node, + "key past end of btree node"); +fsck_err: + return ret; } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 668f595e2fcf..3a370b7087ac 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -21,7 +21,7 @@ extern const struct bkey_ops bch2_bkey_null_ops; * being read or written; more aggressive checks can be enabled when rw == WRITE. */ struct bkey_ops { - int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, + int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -55,7 +55,8 @@ int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, enum bkey_invalid_flags, struct printbuf *); int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, enum bkey_invalid_flags, struct printbuf *); -int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, + struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); @@ -119,16 +120,6 @@ enum btree_update_flags { #define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ - ((1U << KEY_TYPE_alloc)| \ - (1U << KEY_TYPE_alloc_v2)| \ - (1U << KEY_TYPE_alloc_v3)| \ - (1U << KEY_TYPE_alloc_v4)| \ - (1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)| \ - (1U << KEY_TYPE_inode_v2)| \ - (1U << KEY_TYPE_snapshot)) - static inline int bch2_trans_mark_key(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c index b9aa027c881b..bcca9e76a0b4 100644 --- a/fs/bcachefs/bkey_sort.c +++ b/fs/bcachefs/bkey_sort.c @@ -106,7 +106,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, while ((k = sort_iter_peek(iter))) { if (!bkey_deleted(k) && !should_drop_next_key(iter)) { - bkey_copy(out, k); + bkey_p_copy(out, k); btree_keys_account_key_add(&nr, 0, out); out = bkey_p_next(out); } @@ -137,7 +137,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, continue; if (!transform) - bkey_copy(out, in); + bkey_p_copy(out, in); else if (bch2_bkey_transform(out_f, out, bkey_packed(in) ? in_f : &bch2_bkey_format_current, in)) out->format = KEY_FORMAT_LOCAL_BTREE; @@ -191,7 +191,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); set_bkeyp_val_u64s(f, out, 0); } else { - bkey_copy(out, in); + bkey_p_copy(out, in); } out->needs_whiteout |= needs_whiteout; out = bkey_p_next(out); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 5e5858191905..47e7770d0583 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -472,7 +472,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) mutex_init(&c->verify_lock); - shrink = shrinker_alloc(0, "%s/btree_cache", c->name); + shrink = shrinker_alloc(0, "%s-btree_cache", c->name); if (!shrink) goto err; bc->shrink = shrink; @@ -785,12 +785,12 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) "btree node header doesn't match ptr\n" "btree %s level %u\n" "ptr: ", - bch2_btree_ids[b->c.btree_id], b->c.level); + bch2_btree_id_str(b->c.btree_id), b->c.level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_printf(&buf, "\nheader: btree %s level %llu\n" "min ", - bch2_btree_ids[BTREE_NODE_ID(b->data)], + bch2_btree_id_str(BTREE_NODE_ID(b->data)), BTREE_NODE_LEVEL(b->data)); bch2_bpos_to_text(&buf, b->data->min_key); @@ -1153,8 +1153,21 @@ wait_on_io: six_unlock_intent(&b->c.lock); } -void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - const struct btree *b) +const char *bch2_btree_id_str(enum btree_id btree) +{ + return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; +} + +void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +{ + prt_printf(out, "%s level %u/%u\n ", + bch2_btree_id_str(b->c.btree_id), + b->c.level, + bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { struct bset_stats stats; diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 1e562b6efa62..cfb80b201d61 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -123,8 +123,9 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) return bch2_btree_id_root(c, b->c.btree_id)->b; } -void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, - const struct btree *); +const char *bch2_btree_id_str(enum btree_id); +void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 693ed067b1a7..0b5d09c8475d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -95,15 +95,15 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_ids[b->c.btree_id], b->c.level, - buf1.buf, buf2.buf) && - should_restart_for_topology_repair(c)) { + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + btree_node_topology_bad_min_key, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; @@ -122,14 +122,12 @@ static int bch2_gc_check_topology(struct bch_fs *c, bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); bch2_bpos_to_text(&buf2, node_end); - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, + if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, + btree_node_topology_bad_max_key, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); @@ -287,10 +285,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, cur->data->min_key), c, + btree_node_topology_overwritten_by_next_node, "btree node overwritten by next node at btree %s level %u:\n" " node %s\n" " next %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) { ret = DROP_PREV_NODE; goto out; @@ -298,10 +297,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, bpos_predecessor(cur->data->min_key)), c, + btree_node_topology_bad_max_key, "btree node with incorrect max_key at btree %s level %u:\n" " node %s\n" " next %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) ret = set_node_max(c, prev, bpos_predecessor(cur->data->min_key)); @@ -310,20 +310,22 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, if (mustfix_fsck_err_on(bpos_ge(expected_start, cur->data->max_key), c, + btree_node_topology_overwritten_by_prev_node, "btree node overwritten by prev node at btree %s level %u:\n" " prev %s\n" " node %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) { ret = DROP_THIS_NODE; goto out; } if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, + btree_node_topology_bad_min_key, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " node %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) ret = set_node_min(c, cur, expected_start); } @@ -344,10 +346,11 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b, bch2_bpos_to_text(&buf2, b->key.k.p); if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, + btree_node_topology_bad_max_key, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", - bch2_btree_ids[b->c.btree_id], b->c.level, + bch2_btree_id_str(b->c.btree_id), b->c.level, buf1.buf, buf2.buf)) { ret = set_node_max(c, child, b->key.k.p); if (ret) @@ -396,9 +399,10 @@ again: bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(ret == -EIO, c, + btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); @@ -504,9 +508,10 @@ again: bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (mustfix_fsck_err_on(!have_child, c, + btree_node_topology_interior_node_empty, "empty interior btree node at btree %s level %u\n" " %s", - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf)) ret = DROP_THIS_NODE; err: @@ -582,7 +587,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (!g->gen_valid && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + fsck_err(c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -599,7 +605,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (gen_cmp(p.ptr.gen, g->gen) > 0 && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + fsck_err(c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -620,7 +627,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + fsck_err(c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -631,7 +639,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && (c->opts.reconstruct_alloc || - fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + fsck_err(c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), bch2_data_types[ptr_data_type(k->k, &p.ptr)], @@ -645,6 +654,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id if (fsck_err_on(bucket_data_type(g->data_type) && bucket_data_type(g->data_type) != data_type, c, + ptr_bucket_data_type_mismatch, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -664,6 +674,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); if (fsck_err_on(!m || !m->alive, c, + ptr_to_missing_stripe, "pointer to nonexistent stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -672,6 +683,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id do_update = true; if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, + ptr_to_incorrect_stripe, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -811,6 +823,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, goto err; if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + bkey_version_in_future, "key version number higher than recorded: %llu > %llu", k->k->version.lo, atomic64_read(&c->key_version))) @@ -968,9 +981,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b FSCK_CAN_FIX| FSCK_CAN_IGNORE| FSCK_NO_RATELIMIT, + btree_node_read_error, "Unreadable btree node at btree %s level %u:\n" " %s", - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level - 1, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && @@ -1025,6 +1039,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, + btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1034,6 +1049,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, + btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1207,16 +1223,16 @@ static int bch2_gc_done(struct bch_fs *c, percpu_down_write(&c->mark_lock); -#define copy_field(_f, _msg, ...) \ +#define copy_field(_err, _f, _msg, ...) \ if (dst->_f != src->_f && \ (!verify || \ - fsck_err(c, _msg ": got %llu, should be %llu" \ + fsck_err(c, _err, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f))) \ dst->_f = src->_f -#define copy_dev_field(_f, _msg, ...) \ - copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) -#define copy_fs_field(_f, _msg, ...) \ - copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) +#define copy_dev_field(_err, _f, _msg, ...) \ + copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_err, _f, _msg, ...) \ + copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); @@ -1227,13 +1243,17 @@ static int bch2_gc_done(struct bch_fs *c, bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, dev_usage_u64s()); - copy_dev_field(buckets_ec, "buckets_ec"); - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + copy_dev_field(dev_usage_buckets_wrong, + d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(dev_usage_sectors_wrong, + d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(dev_usage_fragmented_wrong, + d[i].fragmented, "%s fragmented", bch2_data_types[i]); } + + copy_dev_field(dev_usage_buckets_ec_wrong, + buckets_ec, "buckets_ec"); } { @@ -1242,17 +1262,24 @@ static int bch2_gc_done(struct bch_fs *c, struct bch_fs_usage *src = (void *) bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); - copy_fs_field(hidden, "hidden"); - copy_fs_field(btree, "btree"); + copy_fs_field(fs_usage_hidden_wrong, + hidden, "hidden"); + copy_fs_field(fs_usage_btree_wrong, + btree, "btree"); if (!metadata_only) { - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); + copy_fs_field(fs_usage_data_wrong, + data, "data"); + copy_fs_field(fs_usage_cached_wrong, + cached, "cached"); + copy_fs_field(fs_usage_reserved_wrong, + reserved, "reserved"); + copy_fs_field(fs_usage_nr_inodes_wrong, + nr_inodes,"nr_inodes"); for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], + copy_fs_field(fs_usage_persistent_reserved_wrong, + persistent_reserved[i], "persistent_reserved[%i]", i); } @@ -1268,7 +1295,8 @@ static int bch2_gc_done(struct bch_fs *c, printbuf_reset(&buf); bch2_replicas_entry_to_text(&buf, e); - copy_fs_field(replicas[i], "%s", buf.buf); + copy_fs_field(fs_usage_replicas_wrong, + replicas[i], "%s", buf.buf); } } @@ -1404,6 +1432,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (c->opts.reconstruct_alloc || fsck_err_on(new.data_type != gc.data_type, c, + alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", iter->pos.inode, iter->pos.offset, @@ -1412,9 +1441,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, bch2_data_types[gc.data_type])) new.data_type = gc.data_type; -#define copy_bucket_field(_f) \ +#define copy_bucket_field(_errtype, _f) \ if (c->opts.reconstruct_alloc || \ - fsck_err_on(new._f != gc._f, c, \ + fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ @@ -1423,11 +1452,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new._f, gc._f)) \ new._f = gc._f; \ - copy_bucket_field(gen); - copy_bucket_field(dirty_sectors); - copy_bucket_field(cached_sectors); - copy_bucket_field(stripe_redundancy); - copy_bucket_field(stripe); + copy_bucket_field(alloc_key_gen_wrong, + gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, + dirty_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, + cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, + stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, + stripe_redundancy); #undef copy_bucket_field if (!bch2_alloc_v4_cmp(*old, new)) @@ -1584,6 +1618,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, } if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + reflink_v_refcount_wrong, "reflink key has wrong refcount:\n" " %s\n" " should be %u", @@ -1709,7 +1744,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, if (bad) bch2_bkey_val_to_text(&buf, c, k); - if (fsck_err_on(bad, c, "%s", buf.buf)) { + if (fsck_err_on(bad, c, stripe_sector_count_wrong, + "%s", buf.buf)) { struct bkey_i_stripe *new; new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); @@ -1954,19 +1990,17 @@ int bch2_gc_gens(struct bch_fs *c) trans = bch2_trans_get(c); for_each_member_device(ca, c, i) { - struct bucket_gens *gens; + struct bucket_gens *gens = bucket_gens(ca); BUG_ON(ca->oldest_gen); - ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); + ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { percpu_ref_put(&ca->ref); ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } - gens = bucket_gens(ca); - for (b = gens->first_bucket; b < gens->nbuckets; b++) ca->oldest_gen[b] = gens->b[b]; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a869cf6ac7c6..37d896edb06e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -184,7 +184,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) k = new_whiteouts; while (ptrs != ptrs_end) { - bkey_copy(k, *ptrs); + bkey_p_copy(k, *ptrs); k = bkey_p_next(k); ptrs++; } @@ -260,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) n = bkey_p_next(k); if (!bkey_deleted(k)) { - bkey_copy(out, k); + bkey_p_copy(out, k); out = bkey_p_next(out); } else { BUG_ON(k->needs_whiteout); @@ -510,16 +510,6 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) bch2_trans_node_reinit_iter(trans, b); } -static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) -{ - prt_printf(out, "%s level %u/%u\n ", - bch2_btree_ids[b->c.btree_id], - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); - bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -} - static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, @@ -532,7 +522,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); - btree_pos_to_text(out, c, b); + bch2_btree_pos_to_text(out, c, b); prt_printf(out, "\n node offset %u", b->written); if (i) @@ -540,7 +530,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -__printf(8, 9) +__printf(9, 10) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, @@ -548,6 +538,7 @@ static int __btree_err(int ret, struct bset *i, int write, bool have_retry, + enum bch_sb_error_id err_type, const char *fmt, ...) { struct printbuf out = PRINTBUF; @@ -572,9 +563,15 @@ static int __btree_err(int ret, if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ret = -BCH_ERR_btree_node_read_err_bad_node; + if (ret != -BCH_ERR_btree_node_read_err_fixable) + bch2_sb_error_count(c, err_type); + switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - mustfix_fsck_err(c, "%s", out.buf); + ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); + if (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore) + goto fsck_err; ret = -BCH_ERR_fsck_fix; break; case -BCH_ERR_btree_node_read_err_want_retry: @@ -599,9 +596,11 @@ fsck_err: return ret; } -#define btree_err(type, c, ca, b, i, msg, ...) \ +#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ ({ \ - int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + BCH_FSCK_ERR_##_err_type, \ + msg, ##__VA_ARGS__); \ \ if (_ret != -BCH_ERR_fsck_fix) { \ ret = _ret; \ @@ -676,13 +675,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int ret = 0; btree_err_on(!bch2_version_compatible(version), - -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, + c, ca, b, i, + btree_node_unsupported_version, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); if (btree_err_on(version < c->sb.version_min, - -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { mutex_lock(&c->sb_lock); @@ -693,7 +696,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), - -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { mutex_lock(&c->sb_lock); @@ -703,11 +708,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, + c, ca, b, i, + btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), - -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_past_end_of_btree_node, "bset past end of btree node")) { i->u64s = 0; ret = 0; @@ -715,12 +724,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(offset && !i->u64s, - -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_empty, "empty bset"); - btree_err_on(BSET_OFFSET(i) && - BSET_OFFSET(i) != offset, - -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_wrong_sector_offset, "bset at wrong sector offset"); if (!offset) { @@ -734,16 +746,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + bset_bad_seq, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_btree, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_level, "incorrect level"); if (!write) @@ -760,7 +778,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_min_key, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), @@ -769,7 +789,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_max_key, "incorrect max key %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); @@ -779,7 +801,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, bn); btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), - -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, + -BCH_ERR_btree_node_read_err_bad_node, + c, ca, b, i, + btree_node_bad_format, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); @@ -802,7 +826,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, struct printbuf *err) { return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); } @@ -823,14 +847,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey tmp; if (btree_err_on(bkey_p_next(k) > vstruct_last(i), - -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_past_bset_end, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_bad_format, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -849,12 +877,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, printbuf_reset(&buf); if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { printbuf_reset(&buf); - prt_printf(&buf, "invalid bkey: "); bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bad_bkey, + "invalid bkey: %s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -878,7 +908,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_dump_bset(c, b, i, 0); - if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) { + if (btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_out_of_order, + "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -919,47 +952,62 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); if (bch2_meta_read_fault("btree")) - btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + btree_err(-BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_fault_injected, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_magic, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); - btree_err_on(!b->data->keys.seq, - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, - "bad btree header: seq 0"); - if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(b->data->keys.seq != bp->seq, - -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_seq, "got wrong btree node (seq %llx want %llx)", b->data->keys.seq, bp->seq); + } else { + btree_err_on(!b->data->keys.seq, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_seq, + "bad btree header: seq 0"); } while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; struct nonce nonce; - struct bch_csum csum; bool first = !b->written; + bool csum_bad; if (!b->written) { i = &b->data->keys; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, - "unknown checksum type %llu", - BSET_CSUM_TYPE(i)); + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - btree_err_on(bch2_crc_cmp(csum, b->data->csum), - -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + csum_bad = bch2_crc_cmp(b->data->csum, + csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data)); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_bad_csum, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -969,7 +1017,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_incompatible, + c, NULL, b, NULL, + btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -981,15 +1031,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, break; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, - "unknown checksum type %llu", - BSET_CSUM_TYPE(i)); + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); nonce = btree_nonce(i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - btree_err_on(bch2_crc_cmp(csum, bne->csum), - -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + csum_bad = bch2_crc_cmp(bne->csum, + csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne)); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_bad_csum, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -1022,12 +1078,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, true); btree_err_on(blacklisted && first, - -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_blacklisted_journal_seq, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, - -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + first_bset_blacklisted_journal_seq, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), b->written, b->written + sectors, ptr_written); @@ -1044,7 +1104,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ptr_written) { btree_err_on(b->written < ptr_written, - -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, NULL, + btree_node_data_missing, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); } else { @@ -1055,7 +1117,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, !bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), true), - -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, NULL, + btree_node_bset_after_end, "found bset signature after last bset"); } @@ -1097,7 +1161,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bad_bkey, + "%s", buf.buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1177,8 +1244,9 @@ static void btree_node_read_work(struct work_struct *work) } start: printbuf_reset(&buf); - btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", + bch2_btree_pos_to_text(&buf, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "btree read error %s for %s", bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); @@ -1213,7 +1281,7 @@ start: printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->key.k.p); bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", - __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); + __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); bch2_btree_node_rewrite_async(c, b); } @@ -1322,14 +1390,20 @@ static void btree_node_read_all_replicas_done(struct closure *cl) } written2 = btree_node_sectors_written(c, ra->buf[i]); - if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_replicas_sectors_written_mismatch, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), - -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_bset_after_end, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), - -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_replicas_data_mismatch, "btree node replicas content mismatch")) dump_bset_maps = true; @@ -1524,7 +1598,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, struct printbuf buf = PRINTBUF; prt_str(&buf, "btree node read error: no device to read from\n at "); - btree_pos_to_text(&buf, c, b); + bch2_btree_pos_to_text(&buf, c, b); bch_err(c, "%s", buf.buf); if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && @@ -1759,7 +1833,8 @@ static void btree_node_write_endio(struct bio *bio) if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1d79514754d7..c2adf3fbb0b3 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && - !btree_type_has_snapshots(iter->btree_id)); + !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) bch2_btree_path_verify(trans, iter->update_path); @@ -362,7 +362,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, bch2_bpos_to_text(&buf, pos); panic("not locked: %s %s%s\n", - bch2_btree_ids[id], buf.buf, + bch2_btree_id_str(id), buf.buf, key_cache ? " cached" : ""); } @@ -1109,6 +1109,9 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, if (unlikely(ret)) goto out; + if (unlikely(!trans->srcu_held)) + bch2_trans_srcu_lock(trans); + /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: @@ -1371,7 +1374,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) struct bkey_s_c old = { &i->old_k, i->old_v }; prt_printf(buf, "update: btree=%s cached=%u %pS", - bch2_btree_ids[i->btree_id], + bch2_btree_id_str(i->btree_id), i->cached, (void *) i->ip_allocated); prt_newline(buf); @@ -1387,7 +1390,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) trans_for_each_wb_update(trans, wb) { prt_printf(buf, "update: btree=%s wb=1 %pS", - bch2_btree_ids[wb->btree], + bch2_btree_id_str(wb->btree), (void *) i->ip_allocated); prt_newline(buf); @@ -1416,7 +1419,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) path->idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', - bch2_btree_ids[path->btree_id], + bch2_btree_id_str(path->btree_id), path->level); bch2_bpos_to_text(out, path->pos); @@ -1523,6 +1526,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; + path->alloc_seq++; btree_path_list_add(trans, pos, path); trans->paths_sorted = false; @@ -1598,7 +1602,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, locks_want = min(locks_want, BTREE_MAX_DEPTH); if (locks_want > path->locks_want) - bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); return path; } @@ -2829,18 +2833,36 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } -static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) +static inline void check_srcu_held_too_long(struct btree_trans *trans) { - struct bch_fs *c = trans->c; - struct btree_path *path; + WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10), + "btree trans held srcu lock (delaying memory reclaim) for %lu seconds", + (jiffies - trans->srcu_lock_time) / HZ); +} - trans_for_each_path(trans, path) - if (path->cached && !btree_node_locked(path, 0)) - path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); +void bch2_trans_srcu_unlock(struct btree_trans *trans) +{ + if (trans->srcu_held) { + struct bch_fs *c = trans->c; + struct btree_path *path; - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; + trans_for_each_path(trans, path) + if (path->cached && !btree_node_locked(path, 0)) + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); + + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + trans->srcu_held = false; + } +} + +void bch2_trans_srcu_lock(struct btree_trans *trans) +{ + if (!trans->srcu_held) { + trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; + } } /** @@ -2894,8 +2916,9 @@ u32 bch2_trans_begin(struct btree_trans *trans) } trans->last_begin_time = now; - if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) - bch2_trans_reset_srcu_lock(trans); + if (unlikely(trans->srcu_held && + time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) + bch2_trans_srcu_unlock(trans); trans->last_begin_ip = _RET_IP_; if (trans->restarted) { @@ -2980,8 +3003,9 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->wb_updates_size = s->wb_updates_size; } - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; + trans->srcu_held = true; if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { struct btree_trans *pos; @@ -3025,7 +3049,7 @@ leaked: trans_for_each_path(trans, path) if (path->ref) printk(KERN_ERR " btree %s %pS\n", - bch2_btree_ids[path->btree_id], + bch2_btree_id_str(path->btree_id), (void *) path->ip_allocated); /* Be noisy about this: */ bch2_fatal_error(c); @@ -3058,7 +3082,10 @@ void bch2_trans_put(struct btree_trans *trans) check_btree_paths_leaked(trans); - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + if (trans->srcu_held) { + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + } bch2_journal_preres_put(&c->journal, &trans->journal_preres); @@ -3100,7 +3127,7 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, prt_tab(out); prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', - b->level, bch2_btree_ids[b->btree_id]); + b->level, bch2_btree_id_str(b->btree_id)); bch2_bpos_to_text(out, btree_node_pos(b)); prt_tab(out); @@ -3130,7 +3157,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) path->idx, path->cached ? 'c' : 'b', path->level, - bch2_btree_ids[path->btree_id]); + bch2_btree_id_str(path->btree_id)); bch2_bpos_to_text(out, path->pos); prt_newline(out); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index fbe273453db3..85e7cb52f6b6 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -274,6 +274,7 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool); int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); +void bch2_trans_unlock_long(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) @@ -411,11 +412,11 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && - btree_node_type_is_extents(btree_id)) + btree_id_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && - !btree_type_has_snapshots(btree_id)) + !btree_type_has_snapshot_field(btree_id)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && @@ -579,6 +580,9 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ KEY_TYPE_##_type, sizeof(*_val), _val) +void bch2_trans_srcu_unlock(struct btree_trans *); +void bch2_trans_srcu_lock(struct btree_trans *); + u32 bch2_trans_begin(struct btree_trans *); /* diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f9a5e38a085b..9b78f78a75b5 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -324,7 +324,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_ids[path->btree_id]); + bch2_btree_id_str(path->btree_id)); return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); } @@ -407,7 +407,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[ck->key.btree_id], new_u64s); + bch2_btree_id_str(ck->key.btree_id), new_u64s); ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; goto err; } @@ -509,7 +509,7 @@ fill: * path->uptodate yet: */ if (!path->locks_want && - !__bch2_btree_path_upgrade(trans, path, 1)) { + !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; @@ -1038,7 +1038,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->table_init_done = true; - shrink = shrinker_alloc(0, "%s/btree_key_cache", c->name); + shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); if (!shrink) return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->shrink = shrink; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 40c8ed8f7bf1..3d48834d091f 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -431,7 +431,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, static inline bool btree_path_get_locks(struct btree_trans *trans, struct btree_path *path, - bool upgrade) + bool upgrade, + struct get_locks_fail *f) { unsigned l = path->level; int fail_idx = -1; @@ -442,8 +443,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - fail_idx = l; + : bch2_btree_node_relock(trans, path, l))) { + fail_idx = l; + + if (f) { + f->l = l; + f->b = path->l[l].b; + } + } l++; } while (l < path->locks_want); @@ -584,7 +591,9 @@ __flatten bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - return btree_path_get_locks(trans, path, false); + struct get_locks_fail f; + + return btree_path_get_locks(trans, path, false, &f); } int __bch2_btree_path_relock(struct btree_trans *trans, @@ -600,22 +609,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans, bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, struct btree_path *path, - unsigned new_locks_want) + unsigned new_locks_want, + struct get_locks_fail *f) { EBUG_ON(path->locks_want >= new_locks_want); path->locks_want = new_locks_want; - return btree_path_get_locks(trans, path, true); + return btree_path_get_locks(trans, path, true, f); } bool __bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, - unsigned new_locks_want) + unsigned new_locks_want, + struct get_locks_fail *f) { struct btree_path *linked; - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) + if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) return true; /* @@ -644,7 +655,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true); + btree_path_get_locks(trans, linked, true, NULL); } return false; @@ -656,6 +667,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, { unsigned l; + if (trans->restarted) + return; + EBUG_ON(path->locks_want < new_locks_want); path->locks_want = new_locks_want; @@ -674,6 +688,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } bch2_btree_path_verify_locks(path); + + path->downgrade_seq++; + trace_path_downgrade(trans, _RET_IP_, path); } /* Btree transaction locking: */ @@ -682,6 +699,9 @@ void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; + if (trans->restarted) + return; + trans_for_each_path(trans, path) bch2_btree_path_downgrade(trans, path); } @@ -733,6 +753,12 @@ void bch2_trans_unlock(struct btree_trans *trans) __bch2_btree_path_unlock(trans, path); } +void bch2_trans_unlock_long(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + bch2_trans_srcu_unlock(trans); +} + bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 6231e9ffc5d7..11b0a2c8cd69 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -355,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ + +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, - struct btree_path *, unsigned); + struct btree_path *, unsigned, + struct get_locks_fail *); + bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned); + struct btree_path *, unsigned, + struct get_locks_fail *); static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { + struct get_locks_fail f; unsigned old_locks_want = path->locks_want; new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); if (path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) : path->uptodate == BTREE_ITER_UPTODATE) return 0; trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, - old_locks_want, new_locks_want); + old_locks_want, new_locks_want, &f); return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); } diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 04c1f4610972..decad7b66c59 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -269,6 +269,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->level != i->path->level); BUG_ON(i->btree_id != i->path->btree_id); EBUG_ON(!i->level && + btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && i->k->k.p.snapshot && @@ -349,7 +350,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[path->btree_id], new_u64s); + bch2_btree_id_str(path->btree_id), new_u64s); return -BCH_ERR_ENOMEM_btree_key_cache_insert; } @@ -379,11 +380,10 @@ static int run_one_mem_trigger(struct btree_trans *trans, if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) + if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) return 0; - if (old_ops->atomic_trigger == new_ops->atomic_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + if (old_ops->atomic_trigger == new_ops->atomic_trigger) { ret = bch2_mark_key(trans, i->btree_id, i->level, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); @@ -425,8 +425,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ if (!i->insert_trigger_run && !i->overwrite_trigger_run && - old_ops->trans_trigger == new_ops->trans_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + old_ops->trans_trigger == new_ops->trans_trigger) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, @@ -683,7 +682,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, BCH_JSET_ENTRY_overwrite, i->btree_id, i->level, i->old_k.u64s); - bkey_reassemble(&entry->start[0], + bkey_reassemble((struct bkey_i *) entry->start, (struct bkey_s_c) { &i->old_k, i->old_v }); } @@ -691,7 +690,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, BCH_JSET_ENTRY_btree_keys, i->btree_id, i->level, i->k->k.u64s); - bkey_copy(&entry->start[0], i->k); + bkey_copy((struct bkey_i *) entry->start, i->k); } trans_for_each_wb_update(trans, wb) { @@ -699,7 +698,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, BCH_JSET_ENTRY_btree_keys, wb->btree, 0, wb->k.k.u64s); - bkey_copy(&entry->start[0], &wb->k); + bkey_copy((struct bkey_i *) entry->start, &wb->k); } if (trans->journal_seq) @@ -776,12 +775,12 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, + enum bkey_invalid_flags flags, struct btree_insert_entry *i, struct printbuf *err) { struct bch_fs *c = trans->c; - int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; printbuf_reset(err); prt_printf(err, "invalid bkey on insert from %s -> %ps", @@ -792,8 +791,7 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); prt_newline(err); - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, err); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); bch2_print_string_as_lines(KERN_ERR, err->buf); bch2_inconsistent_error(c); @@ -864,12 +862,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags */ bch2_journal_res_put(&c->journal, &trans->journal_res); - if (unlikely(ret)) - return ret; - - bch2_trans_downgrade(trans); - - return 0; + return ret; } static int journal_reclaim_wait_done(struct bch_fs *c) @@ -1034,7 +1027,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); btree_insert_entry_checks(trans, i); printbuf_exit(&buf); @@ -1138,6 +1131,8 @@ out: if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: + if (!ret) + bch2_trans_downgrade(trans); bch2_trans_reset_updates(trans); return ret; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bc6714d88925..941841a0c5bf 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -228,6 +228,8 @@ struct btree_path { u8 sorted_idx; u8 ref; u8 intent_ref; + u32 alloc_seq; + u32 downgrade_seq; /* btree_iter_copy starts here: */ struct bpos pos; @@ -424,6 +426,7 @@ struct btree_trans { u8 nr_updates; u8 nr_wb_updates; u8 wb_updates_size; + bool srcu_held:1; bool used_mempool:1; bool in_traverse_all:1; bool paths_sorted:1; @@ -636,16 +639,17 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val, ...) BKEY_TYPE_##kwd = val, + BKEY_TYPE_btree, +#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1, BCH_BTREE_IDS() #undef x - BKEY_TYPE_btree, + BKEY_TYPE_NR }; /* Type of a key in btree @id at level @level: */ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) { - return level ? BKEY_TYPE_btree : (enum btree_node_type) id; + return level ? BKEY_TYPE_btree : (unsigned) id + 1; } /* Type of keys @b contains: */ @@ -654,19 +658,21 @@ static inline enum btree_node_type btree_node_type(struct btree *b) return __btree_node_type(b->c.level, b->c.btree_id); } +const char *bch2_btree_node_type_str(enum btree_node_type); + #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - (BIT(BKEY_TYPE_extents)| \ - BIT(BKEY_TYPE_alloc)| \ - BIT(BKEY_TYPE_inodes)| \ - BIT(BKEY_TYPE_stripes)| \ - BIT(BKEY_TYPE_reflink)| \ - BIT(BKEY_TYPE_btree)) + (BIT_ULL(BKEY_TYPE_extents)| \ + BIT_ULL(BKEY_TYPE_alloc)| \ + BIT_ULL(BKEY_TYPE_inodes)| \ + BIT_ULL(BKEY_TYPE_stripes)| \ + BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ - (BIT(BKEY_TYPE_alloc)| \ - BIT(BKEY_TYPE_inodes)| \ - BIT(BKEY_TYPE_stripes)| \ - BIT(BKEY_TYPE_snapshots)) + (BIT_ULL(BKEY_TYPE_alloc)| \ + BIT_ULL(BKEY_TYPE_inodes)| \ + BIT_ULL(BKEY_TYPE_stripes)| \ + BIT_ULL(BKEY_TYPE_snapshots)) #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ @@ -674,13 +680,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { - return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); + return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type); } static inline bool btree_node_type_is_extents(enum btree_node_type type) { const unsigned mask = 0 -#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) BCH_BTREE_IDS() #undef x ; @@ -690,7 +696,7 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type) static inline bool btree_id_is_extents(enum btree_id btree) { - return btree_node_type_is_extents((enum btree_node_type) btree); + return btree_node_type_is_extents(__btree_node_type(0, btree)); } static inline bool btree_type_has_snapshots(enum btree_id id) @@ -704,6 +710,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1U << id) & mask; } +static inline bool btree_type_has_snapshot_field(enum btree_id id) +{ + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << id) & mask; +} + static inline bool btree_type_has_ptrs(enum btree_id id) { const unsigned mask = 0 diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7dbf6b6c7f34..39c2db68123b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1274,14 +1274,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), WRITE, &buf) ?: - bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { printbuf_reset(&buf); prt_printf(&buf, "inserting invalid bkey\n "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); prt_printf(&buf, "\n "); bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b), WRITE, &buf); - bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); bch2_fs_inconsistent(c, "%s", buf.buf); dump_stack(); @@ -1987,7 +1987,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, out: if (new_path) bch2_path_put(trans, new_path, true); - bch2_btree_path_downgrade(trans, iter->path); + bch2_trans_downgrade(trans); return ret; err: bch2_btree_node_free_never_used(as, trans, n); @@ -2411,30 +2411,24 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry r->level = entry->level; r->alive = true; - bkey_copy(&r->key, &entry->start[0]); + bkey_copy(&r->key, (struct bkey_i *) entry->start); mutex_unlock(&c->btree_root_lock); } struct jset_entry * bch2_btree_roots_to_journal_entries(struct bch_fs *c, - struct jset_entry *start, - struct jset_entry *end) + struct jset_entry *end, + unsigned long skip) { - struct jset_entry *entry; - unsigned long have = 0; unsigned i; - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root) - __set_bit(entry->btree_id, &have); - mutex_lock(&c->btree_root_lock); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); - if (r->alive && !test_bit(i, &have)) { + if (r->alive && !test_bit(i, &skip)) { journal_entry_set(end, BCH_JSET_ENTRY_btree_root, i, r->level, &r->key, r->key.k.u64s); end = vstruct_next(end); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 5e0a467fe905..4df21512d640 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -271,7 +271,7 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree_node_entry *bne = max(write_block(b), (void *) btree_bkey_last(b, bset_tree_last(b))); ssize_t remaining_space = - __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); + __bch_btree_u64s_remaining(c, b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) @@ -303,7 +303,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, k.needs_whiteout = true; b->whiteout_u64s += k.u64s; - bkey_copy(unwritten_whiteouts_start(c, b), &k); + bkey_p_copy(unwritten_whiteouts_start(c, b), &k); } /* @@ -325,7 +325,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *); void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, - struct jset_entry *, struct jset_entry *); + struct jset_entry *, unsigned long); void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a1a4b5feadaa..58d8c6ffd955 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -370,8 +370,8 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, idx = bch2_replicas_entry_idx(c, r); if (idx < 0 && - fsck_err(c, "no replicas entry\n" - " while marking %s", + fsck_err(c, ptr_to_missing_replicas_entry, + "no replicas entry\n while marking %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { percpu_up_read(&c->mark_lock); ret = bch2_mark_replicas(c, r); @@ -695,6 +695,7 @@ static int check_bucket_ref(struct btree_trans *trans, if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -707,6 +708,7 @@ static int check_bucket_ref(struct btree_trans *trans, if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -720,6 +722,7 @@ static int check_bucket_ref(struct btree_trans *trans, if (b_gen != ptr->gen && !ptr->cached) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -741,6 +744,7 @@ static int check_bucket_ref(struct btree_trans *trans, ptr_data_type && bucket_data_type != ptr_data_type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -754,6 +758,7 @@ static int check_bucket_ref(struct btree_trans *trans, if ((u64) bucket_sectors + sectors > U32_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -935,14 +940,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, return 0; } -int bch2_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +static int __mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1018,6 +1021,14 @@ int bch2_mark_extent(struct btree_trans *trans, return 0; } +int bch2_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); +} + int bch2_mark_stripe(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -1124,13 +1135,11 @@ int bch2_mark_stripe(struct btree_trans *trans, return 0; } -int bch2_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +static int __mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bch_fs_usage *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; @@ -1157,6 +1166,14 @@ int bch2_mark_reservation(struct btree_trans *trans, return 0; } +int bch2_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); +} + static s64 __bch2_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 start, u64 end, @@ -1183,7 +1200,8 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans, *idx = r->offset; return 0; not_found: - if (fsck_err(c, "pointer to missing indirect extent\n" + if (fsck_err(c, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" " %s\n" " missing range %llu-%llu", (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), @@ -1211,13 +1229,11 @@ fsck_err: return ret; } -int bch2_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) +static int __mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; @@ -1251,6 +1267,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } +int bch2_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); +} + void bch2_trans_fs_usage_revert(struct btree_trans *trans, struct replicas_delta_list *deltas) { @@ -1298,7 +1322,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, struct bch_fs *c = trans->c; static int warned_disk_usage = 0; bool warn = false; - unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; struct replicas_delta *d, *d2; struct replicas_delta *top = (void *) deltas->d + deltas->used; struct bch_fs_usage *dst; @@ -1357,7 +1381,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) bch2_trans_inconsistent(trans, - "disk usage increased %lli more than %u sectors reserved)", + "disk usage increased %lli more than %llu sectors reserved)", should_not_have_added, disk_res_sectors); return 0; need_mark: @@ -1452,15 +1476,11 @@ err: return ret; } -int bch2_trans_mark_extent(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, struct bkey_i *new, - unsigned flags) +static int __trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { struct bch_fs *c = trans->c; - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE - ? old - : bkey_i_to_s_c(new); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1517,6 +1537,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans, return ret; } +int bch2_trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - + (int) bch2_bkey_needs_rebalance(c, old); + + if (mod) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); + if (ret) + return ret; + } + + return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); +} + static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, struct bkey_s_c_stripe s, unsigned idx, bool deleting) @@ -1670,15 +1708,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -int bch2_trans_mark_reservation(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +static int __trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE - ? old - : bkey_i_to_s_c(new); unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; @@ -1700,7 +1733,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans, return 0; } -static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, +int bch2_trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); +} + +static int trans_mark_reflink_p_segment(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 *idx, unsigned flags) { @@ -1767,35 +1809,38 @@ err: return ret; } -int bch2_trans_mark_reflink_p(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) +static int __trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE - ? old - : bkey_i_to_s_c(new); struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); u64 idx, end_idx; int ret = 0; - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - - v->front_pad = v->back_pad = 0; - } - idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); end_idx = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); while (idx < end_idx && !ret) - ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); - + ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); return ret; } +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; + + v->front_pad = v->back_pad = 0; + } + + return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); +} + static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, @@ -1818,6 +1863,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (a->v.data_type && type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, @@ -1825,16 +1871,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_types[type], bch2_data_types[type]); ret = -EIO; - goto out; + goto err; } - a->v.data_type = type; - a->v.dirty_sectors = sectors; - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - if (ret) - goto out; -out: + if (a->v.data_type != type || + a->v.dirty_sectors != sectors) { + a->v.data_type = type; + a->v.dirty_sectors = sectors; + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + } +err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1929,6 +1975,22 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) return ret; } +int bch2_trans_mark_dev_sbs(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) { + int ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + percpu_ref_put(&ca->ref); + return ret; + } + } + + return 0; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index bf8d7f407e9c..21f6cb356921 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -339,12 +339,27 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ +({ \ + int ret = 0; \ + \ + if (_old.k->type) \ + ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ + if (!ret && _new.k->type) \ + ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ + ret; \ +}) + +#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ + mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) + void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, size_t, enum bch_data_type, unsigned); int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +int bch2_trans_mark_dev_sbs(struct bch_fs *); static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index f69e15dc699c..4bb88aefed12 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -332,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_ioctl_data_event e = { .type = BCH_DATA_EVENT_PROGRESS, .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.btree_id, - .p.pos = ctx->stats.pos, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), .p.sectors_total = bch2_fs_usage_read_short(c).used, }; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 1480b64547b0..a8b148ec2a2b 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -697,14 +697,32 @@ err: return ret; } +void bch2_compression_opt_to_text(struct printbuf *out, u64 v) +{ + struct bch_compression_opt opt = bch2_compression_decode(v); + + if (opt.type < BCH_COMPRESSION_OPT_NR) + prt_str(out, bch2_compression_opts[opt.type]); + else + prt_printf(out, "(unknown compression opt %u)", opt.type); + if (opt.level) + prt_printf(out, ":%u", opt.level); +} + void bch2_opt_compression_to_text(struct printbuf *out, struct bch_fs *c, struct bch_sb *sb, u64 v) { - struct bch_compression_opt opt = bch2_compression_decode(v); + return bch2_compression_opt_to_text(out, v); +} - prt_str(out, bch2_compression_opts[opt.type]); - if (opt.level) - prt_printf(out, ":%u", opt.level); +int bch2_opt_compression_validate(u64 v, struct printbuf *err) +{ + if (!bch2_compression_opt_valid(v)) { + prt_printf(err, "invalid compression opt %llu", v); + return -BCH_ERR_invalid_sb_opt_compression; + } + + return 0; } diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h index 052ea303241f..607fd5e232c9 100644 --- a/fs/bcachefs/compress.h +++ b/fs/bcachefs/compress.h @@ -4,12 +4,18 @@ #include "extents_types.h" +static const unsigned __bch2_compression_opt_to_type[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + struct bch_compression_opt { u8 type:4, level:4; }; -static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) { return (struct bch_compression_opt) { .type = v & 15, @@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v) }; } +static inline bool bch2_compression_opt_valid(unsigned v) +{ + struct bch_compression_opt opt = __bch2_compression_decode(v); + + return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); +} + +static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +{ + return bch2_compression_opt_valid(v) + ? __bch2_compression_decode(v) + : (struct bch_compression_opt) { 0 }; +} + static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) { return opt.type|(opt.level << 4); } -static const unsigned __bch2_compression_opt_to_type[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) { return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; @@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); void bch2_fs_compress_exit(struct bch_fs *); int bch2_fs_compress_init(struct bch_fs *); +void bch2_compression_opt_to_text(struct printbuf *, u64); + int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +int bch2_opt_compression_validate(u64, struct printbuf *); #define bch2_opt_compression (struct bch_opt_fn) { \ - .parse = bch2_opt_compression_parse, \ - .to_text = bch2_opt_compression_to_text, \ + .parse = bch2_opt_compression_parse, \ + .to_text = bch2_opt_compression_to_text, \ + .validate = bch2_opt_compression_validate, \ } #endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 114f86b45fd5..87b4b2d1ec76 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -69,9 +69,15 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, _ret; \ }) +#define darray_remove_item(_d, _pos) \ + array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) + #define darray_for_each(_d, _i) \ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) +#define darray_for_each_reverse(_d, _i) \ + for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + #define darray_init(_d) \ do { \ (_d)->data = NULL; \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 899ff46de8e0..0771a6d880bf 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -13,6 +13,7 @@ #include "keylist.h" #include "move.h" #include "nocow_locking.h" +#include "rebalance.h" #include "subvolume.h" #include "trace.h" @@ -161,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); - /* - * See comment below: bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); - */ rewrites_found |= 1U << i; } i++; @@ -211,14 +208,8 @@ restart_drop_extra_replicas: if (!p.ptr.cached && durability - ptr_durability >= m->op.opts.data_replicas) { durability -= ptr_durability; - bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr); - /* - * Currently, we're dropping unneeded replicas - * instead of marking them as cached, since - * cached data in stripe buckets prevents them - * from being reused: + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); - */ goto restart_drop_extra_replicas; } } @@ -251,11 +242,11 @@ restart_drop_extra_replicas: ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p); - if (ret) - goto err; - - ret = bch2_trans_update(trans, &iter, insert, + k.k->p, insert->k.p) ?: + bch2_bkey_set_needs_rebalance(c, insert, + op->opts.background_target, + op->opts.background_compression) ?: + bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, @@ -281,11 +272,11 @@ next: } continue; nowork: - if (m->ctxt && m->ctxt->stats) { + if (m->stats && m->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); - atomic64_inc(&m->ctxt->stats->keys_raced); + atomic64_inc(&m->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, - &m->ctxt->stats->sectors_raced); + &m->stats->sectors_raced); } this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); @@ -439,6 +430,8 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; m->data_opts = data_opts; + m->ctxt = ctxt; + m->stats = ctxt ? ctxt->stats : NULL; bch2_write_op_init(&m->op, c, io_opts); m->op.pos = bkey_start_pos(k.k); @@ -487,7 +480,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (c->opts.nocow_enabled) { if (ctxt) { - move_ctxt_wait_event(ctxt, trans, + move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || !atomic_read(&ctxt->read_sectors)); diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 7ca1f98d7e94..9dc17b9d8379 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -23,6 +23,7 @@ struct data_update { struct bkey_buf k; struct data_update_opts data_opts; struct moving_context *ctxt; + struct bch_move_stats *stats; struct bch_write_op op; }; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 75a3dc7cbd47..57c5128db173 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -517,7 +517,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * prt_printf(out, "%px btree=%s l=%u ", b, - bch2_btree_ids[b->c.btree_id], + bch2_btree_id_str(b->c.btree_id), b->c.level); prt_newline(out); @@ -919,18 +919,18 @@ void bch2_fs_debug_init(struct bch_fs *c) bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; - debugfs_create_file(bch2_btree_ids[bd->id], + debugfs_create_file(bch2_btree_id_str(bd->id), 0400, c->btree_debug_dir, bd, &btree_debug_ops); snprintf(name, sizeof(name), "%s-formats", - bch2_btree_ids[bd->id]); + bch2_btree_id_str(bd->id)); debugfs_create_file(name, 0400, c->btree_debug_dir, bd, &btree_format_debug_ops); snprintf(name, sizeof(name), "%s-bfloat-failed", - bch2_btree_ids[bd->id]); + bch2_btree_id_str(bd->id)); debugfs_create_file(name, 0400, c->btree_debug_dir, bd, &bfloat_failed_debug_ops); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 6c6c8d57d72b..1a0f2d571569 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -97,61 +97,51 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); + int ret = 0; - if (!d_name.len) { - prt_printf(err, "empty name"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(!d_name.len, c, err, + dirent_empty_name, + "empty name"); - if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) { - prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, + dirent_val_too_big, + "value too big (%zu > %u)", + bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); /* * Check new keys don't exceed the max length * (older keys may be larger.) */ - if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) { - prt_printf(err, "dirent name too big (%u > %u)", - d_name.len, BCH_NAME_MAX); - return -BCH_ERR_invalid_bkey; - } - - if (d_name.len != strnlen(d_name.name, d_name.len)) { - prt_printf(err, "dirent has stray data after name's NUL"); - return -BCH_ERR_invalid_bkey; - } - - if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) { - prt_printf(err, "invalid name"); - return -BCH_ERR_invalid_bkey; - } - - if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) { - prt_printf(err, "invalid name"); - return -BCH_ERR_invalid_bkey; - } - - if (memchr(d_name.name, '/', d_name.len)) { - prt_printf(err, "invalid name"); - return -BCH_ERR_invalid_bkey; - } - - if (d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode) { - prt_printf(err, "dirent points to own directory"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, + dirent_name_too_long, + "dirent name too big (%u > %u)", + d_name.len, BCH_NAME_MAX); + + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, + dirent_name_embedded_nul, + "dirent has stray data after name's NUL"); + + bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, + dirent_name_dot_or_dotdot, + "invalid name"); + + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, + dirent_name_has_slash, + "name with /"); + + bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, + dirent_to_itself, + "dirent points to own directory"); +fsck_err: + return ret; } void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index e9fa1df38232..cd262bf4d9c5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -7,7 +7,7 @@ enum bkey_invalid_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index e00133b6ea51..d613695abf9f 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -175,6 +175,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) dst->deleted = BCH_GROUP_DELETED(src); dst->parent = BCH_GROUP_PARENT(src); + memcpy(dst->label, src->label, sizeof(dst->label)); } for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { @@ -382,7 +383,57 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + struct bch_disk_groups_cpu *groups; + struct bch_disk_group_cpu *g; + unsigned nr = 0; + u16 path[32]; + + out->atomic++; + rcu_read_lock(); + groups = rcu_dereference(c->disk_groups); + if (!groups) + goto invalid; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto invalid; + + if (v >= groups->nr) + goto invalid; + + g = groups->entries + v; + + if (g->deleted) + goto invalid; + + path[nr++] = v; + + if (!g->parent) + break; + + v = g->parent - 1; + } + + while (nr) { + v = path[--nr]; + g = groups->entries + v; + + prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + prt_printf(out, "."); + } +out: + rcu_read_unlock(); + out->atomic--; + return; +invalid: + prt_printf(out, "invalid label %u", v); + goto out; +} + +void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct bch_sb_field_disk_groups *groups = bch2_sb_field_get(sb, disk_groups); @@ -493,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, return -EINVAL; } -void bch2_opt_target_to_text(struct printbuf *out, - struct bch_fs *c, - struct bch_sb *sb, - u64 v) +void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) { struct target t = target_decode(v); @@ -504,47 +552,69 @@ void bch2_opt_target_to_text(struct printbuf *out, case TARGET_NULL: prt_printf(out, "none"); break; - case TARGET_DEV: - if (c) { - struct bch_dev *ca; - - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices - ? rcu_dereference(c->devs[t.dev]) - : NULL; - - if (ca && percpu_ref_tryget(&ca->io_ref)) { - prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); - percpu_ref_put(&ca->io_ref); - } else if (ca) { - prt_printf(out, "offline device %u", t.dev); - } else { - prt_printf(out, "invalid device %u", t.dev); - } - - rcu_read_unlock(); + case TARGET_DEV: { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + prt_printf(out, "offline device %u", t.dev); } else { - struct bch_member m = bch2_sb_member_get(sb, t.dev); - - if (bch2_dev_exists(sb, t.dev)) { - prt_printf(out, "Device "); - pr_uuid(out, m.uuid.b); - prt_printf(out, " (%u)", t.dev); - } else { - prt_printf(out, "Bad device %u", t.dev); - } + prt_printf(out, "invalid device %u", t.dev); } + + rcu_read_unlock(); break; + } case TARGET_GROUP: - if (c) { - mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); - mutex_unlock(&c->sb_lock); + bch2_disk_path_to_text(out, c, t.group); + break; + default: + BUG(); + } +} + +void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct target t = target_decode(v); + + switch (t.type) { + case TARGET_NULL: + prt_printf(out, "none"); + break; + case TARGET_DEV: { + struct bch_member m = bch2_sb_member_get(sb, t.dev); + + if (bch2_dev_exists(sb, t.dev)) { + prt_printf(out, "Device "); + pr_uuid(out, m.uuid.b); + prt_printf(out, " (%u)", t.dev); } else { - bch2_disk_path_to_text(out, sb, t.group); + prt_printf(out, "Bad device %u", t.dev); } break; + } + case TARGET_GROUP: + bch2_disk_path_to_text_sb(out, sb, t.group); + break; default: BUG(); } } + +void bch2_opt_target_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + if (c) + bch2_target_to_text(out, c, v); + else + bch2_target_to_text_sb(out, sb, v); +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h index bd7711767fd4..441826fff224 100644 --- a/fs/bcachefs/disk_groups.h +++ b/fs/bcachefs/disk_groups.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_DISK_GROUPS_H #define _BCACHEFS_DISK_GROUPS_H +#include "disk_groups_types.h" + extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) @@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *); /* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); +void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); +void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); + +void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h new file mode 100644 index 000000000000..a54ef085b13d --- /dev/null +++ b/fs/bcachefs/disk_groups_types.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H +#define _BCACHEFS_DISK_GROUPS_TYPES_H + +struct bch_disk_group_cpu { + bool deleted; + u16 parent; + u8 label[BCH_SB_LABEL_SIZE]; + struct bch_devs_mask devs; +}; + +struct bch_disk_groups_cpu { + struct rcu_head rcu; + unsigned nr; + struct bch_disk_group_cpu entries[] __counted_by(nr); +}; + +#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8646856e4539..875f7c5a6fca 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -105,29 +105,26 @@ struct ec_bio { /* Stripes btree keys: */ -int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + int ret = 0; - if (bkey_eq(k.k->p, POS_MIN)) { - prt_printf(err, "stripe at POS_MIN"); - return -BCH_ERR_invalid_bkey; - } - - if (k.k->p.inode) { - prt_printf(err, "nonzero inode field"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || + bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, + stripe_pos_bad, + "stripe at bad pos"); - if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { - prt_printf(err, "incorrect value size (%zu < %u)", - bkey_val_u64s(k.k), stripe_val_u64s(s)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, + stripe_val_size_bad, + "incorrect value size (%zu < %u)", + bkey_val_u64s(k.k), stripe_val_u64s(s)); - return bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -153,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); if (i < nr_data) prt_printf(out, "#%u", stripe_blockcount_get(s, i)); + prt_printf(out, " gen %u", ptr->gen); if (ptr_stale(ca, ptr)) prt_printf(out, " stale"); } @@ -306,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) struct bch_csum got = ec_block_checksum(buf, i, offset); if (bch2_crc_cmp(want, got)) { - struct printbuf buf2 = PRINTBUF; + struct printbuf err = PRINTBUF; + struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); + + prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", + want.hi, want.lo, + got.hi, got.lo, + bch2_csum_types[v->csum_type]); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); + printbuf_exit(&err); - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); - - bch_err_ratelimited(c, - "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", - (void *) _RET_IP_, i, j, v->csum_type, - want.lo, got.lo, buf2.buf); - printbuf_exit(&buf2); clear_bit(i, buf->valid); + + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); break; } @@ -373,7 +376,11 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, + bio_data_dir(bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "erasure coding %s error: %s", bio_data_dir(bio) ? "write" : "read", bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); @@ -474,14 +481,10 @@ err: return ret; } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) -{ - return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe)); -} - /* recovery read path: */ -int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) { + struct bch_fs *c = trans->c; struct ec_stripe_buf *buf; struct closure cl; struct bch_stripe *v; @@ -496,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) if (!buf) return -BCH_ERR_ENOMEM_ec_read_extent; - ret = get_stripe_key(c, rbio->pick.ec.idx, buf); + ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { bch_err_ratelimited(c, "error doing reconstruct read: error %i looking up stripe", ret); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 966d165a3b66..7d0237c9819f 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -8,7 +8,7 @@ enum bkey_invalid_flags; -int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -199,7 +199,7 @@ struct ec_stripe_head { struct ec_stripe_new *s; }; -int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); +int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 7cc083776a2e..68a1a96bb7ca 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -3,6 +3,8 @@ #define _BCACHEFS_ERRCODE_H #define BCH_ERRCODES() \ + x(ERANGE, ERANGE_option_too_small) \ + x(ERANGE, ERANGE_option_too_big) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ @@ -213,6 +215,8 @@ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ x(BCH_ERR_invalid_sb, invalid_sb_clean) \ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid_sb, invalid_sb_errors) \ + x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 2a5af8872613..7b28d37922fd 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -56,8 +56,9 @@ void bch2_io_error_work(struct work_struct *work) up_write(&c->state_lock); } -void bch2_io_error(struct bch_dev *ca) +void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { + atomic64_inc(&ca->errors[type]); //queue_work(system_long_wq, &ca->io_error_work); } @@ -116,31 +117,34 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) return NULL; - list_for_each_entry(s, &c->fsck_errors, list) + list_for_each_entry(s, &c->fsck_error_msgs, list) if (s->fmt == fmt) { /* * move it to the head of the list: repeated fsck errors * are common */ - list_move(&s->list, &c->fsck_errors); + list_move(&s->list, &c->fsck_error_msgs); return s; } s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) { - if (!c->fsck_alloc_err) + if (!c->fsck_alloc_msgs_err) bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); - c->fsck_alloc_err = true; + c->fsck_alloc_msgs_err = true; return NULL; } INIT_LIST_HEAD(&s->list); s->fmt = fmt; - list_add(&s->list, &c->fsck_errors); + list_add(&s->list, &c->fsck_error_msgs); return s; } -int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) +int bch2_fsck_err(struct bch_fs *c, + enum bch_fsck_flags flags, + enum bch_sb_error_id err, + const char *fmt, ...) { struct fsck_err_state *s = NULL; va_list args; @@ -148,11 +152,13 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + bch2_sb_error_count(c, err); + va_start(args, fmt); prt_vprintf(out, fmt, args); va_end(args); - mutex_lock(&c->fsck_error_lock); + mutex_lock(&c->fsck_error_msgs_lock); s = fsck_err_get(c, fmt); if (s) { /* @@ -162,7 +168,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) */ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; - mutex_unlock(&c->fsck_error_lock); + mutex_unlock(&c->fsck_error_msgs_lock); printbuf_exit(&buf); return ret; } @@ -257,7 +263,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) if (s) s->ret = ret; - mutex_unlock(&c->fsck_error_lock); + mutex_unlock(&c->fsck_error_msgs_lock); printbuf_exit(&buf); @@ -278,9 +284,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c) { struct fsck_err_state *s, *n; - mutex_lock(&c->fsck_error_lock); + mutex_lock(&c->fsck_error_msgs_lock); - list_for_each_entry_safe(s, n, &c->fsck_errors, list) { + list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { if (s->ratelimited && s->last_msg) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); @@ -289,5 +295,5 @@ void bch2_flush_fsck_errs(struct bch_fs *c) kfree(s); } - mutex_unlock(&c->fsck_error_lock); + mutex_unlock(&c->fsck_error_msgs_lock); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7ce9540052e5..d167d65986e0 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -4,6 +4,7 @@ #include <linux/list.h> #include <linux/printk.h> +#include "sb-errors.h" struct bch_dev; struct bch_fs; @@ -101,18 +102,26 @@ struct fsck_err_state { char *last_msg; }; -#define FSCK_CAN_FIX (1 << 0) -#define FSCK_CAN_IGNORE (1 << 1) -#define FSCK_NEED_FSCK (1 << 2) -#define FSCK_NO_RATELIMIT (1 << 3) +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, +}; + +#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -__printf(3, 4) __cold -int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); +__printf(4, 5) __cold +int bch2_fsck_err(struct bch_fs *, + enum bch_fsck_flags, + enum bch_sb_error_id, + const char *, ...); void bch2_flush_fsck_errs(struct bch_fs *); -#define __fsck_err(c, _flags, msg, ...) \ +#define __fsck_err(c, _flags, _err_type, ...) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ + int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \ + __VA_ARGS__); \ \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ @@ -127,26 +136,53 @@ void bch2_flush_fsck_errs(struct bch_fs *); /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -#define __fsck_err_on(cond, c, _flags, ...) \ - (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) +#define __fsck_err_on(cond, c, _flags, _err_type, ...) \ + (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false) + +#define need_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + +#define need_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + +#define mustfix_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) + +#define mustfix_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) -#define need_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) +#define fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -#define need_fsck_err(c, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) +#define fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -#define mustfix_fsck_err(c, ...) \ - __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) +static inline void bch2_bkey_fsck_err(struct bch_fs *c, + struct printbuf *err_msg, + enum bch_sb_error_id err_type, + const char *fmt, ...) +{ + va_list args; -#define mustfix_fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) + va_start(args, fmt); + prt_vprintf(err_msg, fmt, args); + va_end(args); -#define fsck_err(c, ...) \ - __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) +} -#define fsck_err_on(cond, c, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) +#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +do { \ + prt_printf(_err_msg, __VA_ARGS__); \ + bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ + ret = -BCH_ERR_invalid_bkey; \ + goto fsck_err; \ +} while (0) + +#define bkey_fsck_err_on(cond, ...) \ +do { \ + if (unlikely(cond)) \ + bkey_fsck_err(__VA_ARGS__); \ +} while (0) /* * Fatal errors: these don't indicate a bug, but we can't continue running in RW @@ -179,26 +215,26 @@ do { \ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ -void bch2_io_error(struct bch_dev *); +void bch2_io_error(struct bch_dev *, enum bch_member_error_type); -#define bch2_dev_io_err_on(cond, ca, ...) \ +#define bch2_dev_io_err_on(cond, ca, _type, ...) \ ({ \ bool _ret = (cond); \ \ if (_ret) { \ bch_err_dev_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca); \ + bch2_io_error(ca, _type); \ } \ _ret; \ }) -#define bch2_dev_inum_io_err_on(cond, ca, ...) \ +#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \ ({ \ bool _ret = (cond); \ \ if (_ret) { \ bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca); \ + bch2_io_error(ca, _type); \ } \ _ret; \ }) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1b25f84e4b9c..a864de231b69 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -13,6 +13,7 @@ #include "btree_iter.h" #include "buckets.h" #include "checksum.h" +#include "compress.h" #include "debug.h" #include "disk_groups.h" #include "error.h" @@ -162,17 +163,19 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { - prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, + btree_ptr_val_too_big, + "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - return bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -181,17 +184,20 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { - prt_printf(err, "value too big (%zu > %zu)", - bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, + btree_ptr_v2_val_too_big, + "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - return bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -372,19 +378,18 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + int ret = 0; - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { - prt_printf(err, "invalid nr_replicas (%u)", - r.v->nr_replicas); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, + reservation_key_nr_replicas_invalid, + "invalid nr_replicas (%u)", r.v->nr_replicas); +fsck_err: + return ret; } void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, @@ -757,18 +762,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, return i; } -static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) -{ - union bch_extent_entry *next = extent_entry_next(entry); - - /* stripes have ptrs, but their layout doesn't work with this code */ - BUG_ON(k.k->type == KEY_TYPE_stripe); - - memmove_u64s_down(entry, next, - (u64 *) bkey_val_end(k) - (u64 *) next); - k.k->u64s -= (u64 *) next - (u64 *) entry; -} - /* * Returns pointer to the next entry after the one being dropped: */ @@ -992,10 +985,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct bch_extent_crc_unpacked crc; - const struct bch_extent_ptr *ptr; - const struct bch_extent_stripe_ptr *ec; - struct bch_dev *ca; bool first = true; if (c) @@ -1006,9 +995,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *ptr = entry_to_ptr(entry); + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ? bch_dev_bkey_exists(c, ptr->dev) : NULL; @@ -1030,10 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " stale"); } break; + } case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: - case BCH_EXTENT_ENTRY_crc128: - crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + case BCH_EXTENT_ENTRY_crc128: { + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, @@ -1042,12 +1033,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bch2_csum_types[crc.csum_type], bch2_compression_types[crc.compression_type]); break; - case BCH_EXTENT_ENTRY_stripe_ptr: - ec = &entry->stripe_ptr; + } + case BCH_EXTENT_ENTRY_stripe_ptr: { + const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; prt_printf(out, "ec: idx %llu block %u", (u64) ec->idx, ec->block); break; + } + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + prt_str(out, "rebalance: target "); + if (c) + bch2_target_to_text(out, c, r->target); + else + prt_printf(out, "%u", r->target); + prt_str(out, " compression "); + bch2_compression_opt_to_text(out, r->compression); + break; + } default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1057,7 +1062,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -static int extent_ptr_invalid(const struct bch_fs *c, +static int extent_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, const struct bch_extent_ptr *ptr, @@ -1070,6 +1075,7 @@ static int extent_ptr_invalid(const struct bch_fs *c, u64 bucket; u32 bucket_offset; struct bch_dev *ca; + int ret = 0; if (!bch2_dev_exists2(c, ptr->dev)) { /* @@ -1080,41 +1086,33 @@ static int extent_ptr_invalid(const struct bch_fs *c, if (flags & BKEY_INVALID_WRITE) return 0; - prt_printf(err, "pointer to invalid device (%u)", ptr->dev); - return -BCH_ERR_invalid_bkey; + bkey_fsck_err(c, err, ptr_to_invalid_device, + "pointer to invalid device (%u)", ptr->dev); } ca = bch_dev_bkey_exists(c, ptr->dev); bkey_for_each_ptr(ptrs, ptr2) - if (ptr != ptr2 && ptr->dev == ptr2->dev) { - prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, + ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); - if (bucket >= ca->mi.nbuckets) { - prt_printf(err, "pointer past last bucket (%llu > %llu)", - bucket, ca->mi.nbuckets); - return -BCH_ERR_invalid_bkey; - } - - if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { - prt_printf(err, "pointer before first bucket (%llu < %u)", - bucket, ca->mi.first_bucket); - return -BCH_ERR_invalid_bkey; - } - - if (bucket_offset + size_ondisk > ca->mi.bucket_size) { - prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", + bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, + ptr_after_last_bucket, + "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); + bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, + ptr_before_first_bucket, + "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + ptr_spans_multiple_buckets, + "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, ca->mi.bucket_size); - return -BCH_ERR_invalid_bkey; - } - - return 0; +fsck_err: + return ret; } -int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { @@ -1124,24 +1122,22 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; - bool unwritten = false, have_ec = false, crc_since_last_ptr = false; - int ret; + bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; + int ret = 0; if (bkey_is_btree_ptr(k.k)) size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { - prt_printf(err, "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, + extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - if (bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry)) { - prt_printf(err, "has non ptr field"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry), c, err, + btree_ptr_has_non_ptr, + "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: @@ -1150,22 +1146,15 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (ret) return ret; - if (nr_ptrs && unwritten != entry->ptr.unwritten) { - prt_printf(err, "extent with unwritten and written ptrs"); - return -BCH_ERR_invalid_bkey; - } - - if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { - prt_printf(err, "has unwritten ptrs"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, + ptr_cached_and_erasure_coded, + "cached, erasure coded ptr"); - if (entry->ptr.cached && have_ec) { - prt_printf(err, "cached, erasure coded ptr"); - return -BCH_ERR_invalid_bkey; - } + if (!entry->ptr.unwritten) + have_written = true; + else + have_unwritten = true; - unwritten = entry->ptr.unwritten; have_ec = false; crc_since_last_ptr = false; nr_ptrs++; @@ -1175,72 +1164,77 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - if (crc.offset + crc.live_size > - crc.uncompressed_size) { - prt_printf(err, "checksum offset + key size > uncompressed size"); - return -BCH_ERR_invalid_bkey; - } - - size_ondisk = crc.compressed_size; - - if (!bch2_checksum_type_valid(c, crc.csum_type)) { - prt_printf(err, "invalid checksum type"); - return -BCH_ERR_invalid_bkey; - } - - if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { - prt_printf(err, "invalid compression type"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, + ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, + ptr_crc_csum_type_unknown, + "invalid checksum type"); + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, + ptr_crc_compression_type_unknown, + "invalid compression type"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; - else if (nonce != crc.offset + crc.nonce) { - prt_printf(err, "incorrect nonce"); - return -BCH_ERR_invalid_bkey; - } + else if (nonce != crc.offset + crc.nonce) + bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + "incorrect nonce"); } - if (crc_since_last_ptr) { - prt_printf(err, "redundant crc entry"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(crc_since_last_ptr, c, err, + ptr_crc_redundant, + "redundant crc entry"); crc_since_last_ptr = true; + + bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, + ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + + size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: - if (have_ec) { - prt_printf(err, "redundant stripe entry"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(have_ec, c, err, + ptr_stripe_redundant, + "redundant stripe entry"); have_ec = true; break; - case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + if (!bch2_compression_opt_valid(r->compression)) { + struct bch_compression_opt opt = __bch2_compression_decode(r->compression); + prt_printf(err, "invalid compression opt %u:%u", + opt.type, opt.level); + return -BCH_ERR_invalid_bkey; + } break; } + } } - if (!nr_ptrs) { - prt_str(err, "no ptrs"); - return -BCH_ERR_invalid_bkey; - } - - if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { - prt_str(err, "too many ptrs"); - return -BCH_ERR_invalid_bkey; - } - - if (crc_since_last_ptr) { - prt_printf(err, "redundant crc entry"); - return -BCH_ERR_invalid_bkey; - } - - if (have_ec) { - prt_printf(err, "redundant stripe entry"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + bkey_fsck_err_on(!nr_ptrs, c, err, + extent_ptrs_no_ptrs, + "no ptrs"); + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, + extent_ptrs_too_many_ptrs, + "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); + bkey_fsck_err_on(have_written && have_unwritten, c, err, + extent_ptrs_written_and_unwritten, + "extent with unwritten and written ptrs"); + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, + extent_ptrs_unwritten, + "has unwritten ptrs"); + bkey_fsck_err_on(crc_since_last_ptr, c, err, + extent_ptrs_redundant_crc, + "redundant crc entry"); + bkey_fsck_err_on(have_ec, c, err, + extent_ptrs_redundant_stripe, + "redundant stripe entry"); +fsck_err: + return ret; } void bch2_ptr_swab(struct bkey_s k) @@ -1281,6 +1275,125 @@ void bch2_ptr_swab(struct bkey_s k) } } +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned rewrite_ptrs = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) { + rewrite_ptrs = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= 1U << i; + i++; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) + rewrite_ptrs |= 1U << i; + i++; + } + } + + return rewrite_ptrs; +} + +bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + /* + * If it's an indirect extent, we don't delete the rebalance entry when + * done so that we know what options were applied - check if it still + * needs work done: + */ + if (r && + k.k->type == KEY_TYPE_reflink_v && + !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) + r = NULL; + + return r != NULL; +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, + unsigned target, unsigned compression) +{ + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *r; + bool needs_rebalance; + + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + /* get existing rebalance entry: */ + r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + if (r) { + if (k.k->type == KEY_TYPE_reflink_v) { + /* + * indirect extents: existing options take precedence, + * so that we don't move extents back and forth if + * they're referenced by different inodes with different + * options: + */ + if (r->target) + target = r->target; + if (r->compression) + compression = r->compression; + } + + r->target = target; + r->compression = compression; + } + + needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); + + if (needs_rebalance && !r) { + union bch_extent_entry *new = bkey_val_end(k); + + new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; + new->rebalance.compression = compression; + new->rebalance.target = target; + new->rebalance.unused = 0; + k.k->u64s += extent_entry_u64s(new); + } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { + /* + * For indirect extents, don't delete the rebalance entry when + * we're finished so that we know we specifically moved it or + * compressed it to its current location/compression type + */ + extent_entry_drop(k, (union bch_extent_entry *) r); + } + + return 0; +} + /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 879e7d218b6a..a2ce8a3be13c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k, memcpy_u64s_small(dst, new, extent_entry_u64s(new)); } +static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; @@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); } +static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) +{ + return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); +} + /* bkey_ptrs: generically over any key type that has ptrs */ struct bkey_ptrs_c { @@ -383,12 +400,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, @@ -428,7 +445,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -688,11 +705,19 @@ void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_ptr_swab(struct bkey_s); +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, + unsigned, unsigned); +bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); + +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, + unsigned, unsigned); + /* Generic extent code: */ enum bch_extent_overlap { @@ -737,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) k->size = new_size; } -/* - * In extent_sort_fix_overlapping(), insert_fixup_extent(), - * extent_merge_inline() - we're modifying keys in place that are packed. To do - * that we have to unpack the key, modify the unpacked key - then this - * copies/repacks the unpacked to the original as necessary. - */ -static inline void extent_save(struct btree *b, struct bkey_packed *dst, - struct bkey *src) -{ - struct bkey_format *f = &b->format; - struct bkey_i *dst_unpacked; - - if ((dst_unpacked = packed_to_bkey(dst))) - dst_unpacked->k = *src; - else - BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -} - #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index bb5305441f27..4496cf91a4c1 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -51,7 +51,7 @@ int bch2_create_trans(struct btree_trans *trans, bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); if (flags & BCH_CREATE_TMPFILE) - new_inode->bi_flags |= BCH_INODE_UNLINKED; + new_inode->bi_flags |= BCH_INODE_unlinked; ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); if (ret) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 58ccc7b91ac7..52f0e7acda3d 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -389,6 +389,21 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs return ret; } +/* + * Determine when a writepage io is full. We have to limit writepage bios to a + * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to + * what the bounce path in bch2_write_extent() can handle. In theory we could + * loosen this restriction for non-bounce I/O, but we don't have that context + * here. Ideally, we can up this limit and make it configurable in the future + * when the bounce path can be enhanced to accommodate larger source bios. + */ +static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) +{ + struct bio *bio = &io->op.wbio.bio; + return bio_full(bio, len) || + (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); +} + static void bch2_writepage_io_done(struct bch_write_op *op) { struct bch_writepage_io *io = @@ -606,9 +621,7 @@ do_io: if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio, sectors << 9) || - w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= - (BIO_MAX_VECS * PAGE_SIZE) || + bch_io_full(w->io, sectors << 9) || bio_end_sector(&w->io->op.wbio.bio) != sector)) bch2_writepage_do_io(w); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 6a9557e7ecab..5b42a76c4796 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -113,6 +113,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) } else { atomic_set(&dio->cl.remaining, CLOSURE_REMAINING_INITIALIZER + 1); + dio->cl.closure_get_happened = true; } dio->req = req; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 6040bd3f0778..5a39bcb597a3 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -45,13 +45,13 @@ static int bch2_inode_flags_set(struct btree_trans *trans, unsigned newflags = s->flags; unsigned oldflags = bi->bi_flags & s->mask; - if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && + if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; if (!S_ISREG(bi->bi_mode) && !S_ISDIR(bi->bi_mode) && - (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) + (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) return -EINVAL; if (s->set_projinherit) { diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index 54a9c21a3b83..d30f9bb056fd 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -6,28 +6,28 @@ /* bcachefs inode flags -> vfs inode flags: */ static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, }; /* bcachefs inode flags -> FS_IOC_GETFLAGS: */ static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ static const __maybe_unused unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, - [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, - [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, - [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + [__BCH_INODE_sync] = FS_XFLAG_SYNC, + [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_append] = FS_XFLAG_APPEND, + [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, + [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; }; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a2a5133fb6b5..166d8d8abe68 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -764,15 +764,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); } - if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) + if (inode->ei_inode.bi_flags & BCH_INODE_immutable) stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= STATX_ATTR_IMMUTABLE; - if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) + if (inode->ei_inode.bi_flags & BCH_INODE_append) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= STATX_ATTR_APPEND; - if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) + if (inode->ei_inode.bi_flags & BCH_INODE_nodump) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_NODUMP; @@ -1213,9 +1213,6 @@ static struct dentry *bch2_get_parent(struct dentry *child) .inum = inode->ei_inode.bi_dir, }; - if (!parent_inum.inum) - return NULL; - return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b8f9e7475dc5..9f3e9bd3d767 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_buf.h" +#include "btree_cache.h" #include "btree_update.h" #include "buckets.h" #include "darray.h" @@ -444,9 +445,10 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, if (i->equiv == n.equiv) { bch_err(c, "snapshot deletion did not finish:\n" " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_ids[btree_id], + bch2_btree_id_str(btree_id), pos.inode, pos.offset, i->id, n.id, n.equiv); + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } } @@ -719,8 +721,9 @@ static int check_key_has_snapshot(struct btree_trans *trans, int ret = 0; if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + bkey_in_missing_snapshot, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; fsck_err: @@ -789,6 +792,7 @@ static int hash_check_key(struct btree_trans *trans, if (fsck_err_on(k.k->type == desc.key_type && !desc.cmp_bkey(k, hash_k), c, + hash_table_key_duplicate, "duplicate hash table keys:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), @@ -807,8 +811,9 @@ out: printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + if (fsck_err(c, hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); @@ -849,22 +854,23 @@ static int check_inode(struct btree_trans *trans, BUG_ON(bch2_inode_unpack(k, &u)); if (!full && - !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) + !(u.bi_flags & (BCH_INODE_i_size_dirty| + BCH_INODE_i_sectors_dirty| + BCH_INODE_unlinked))) return 0; if (prev->bi_inum != u.bi_inum) *prev = u; if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || - inode_d_type(prev) != inode_d_type(&u), c, + inode_d_type(prev) != inode_d_type(&u), + c, inode_snapshot_mismatch, "inodes in different snapshots don't match")) { bch_err(c, "repair not implemented yet"); return -EINVAL; } - if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && + if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { struct bpos new_min_pos; @@ -872,7 +878,7 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; - u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; + u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; ret = __write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); @@ -884,9 +890,10 @@ static int check_inode(struct btree_trans *trans, return 0; } - if (u.bi_flags & BCH_INODE_UNLINKED && + if (u.bi_flags & BCH_INODE_unlinked && (!c->sb.clean || - fsck_err(c, "filesystem marked clean, but inode %llu unlinked", + fsck_err(c, inode_unlinked_but_clean, + "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); @@ -896,9 +903,10 @@ static int check_inode(struct btree_trans *trans, return ret; } - if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && + if (u.bi_flags & BCH_INODE_i_size_dirty && (!c->sb.clean || - fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", + fsck_err(c, inode_i_size_dirty_but_clean, + "filesystem marked clean, but inode %llu has i_size dirty", u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); @@ -922,15 +930,16 @@ static int check_inode(struct btree_trans *trans, * We truncated without our normal sector accounting hook, just * make sure we recalculate it: */ - u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; + u.bi_flags |= BCH_INODE_i_sectors_dirty; - u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + u.bi_flags &= ~BCH_INODE_i_size_dirty; do_update = true; } - if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && + if (u.bi_flags & BCH_INODE_i_sectors_dirty && (!c->sb.clean || - fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", + fsck_err(c, inode_i_sectors_dirty_but_clean, + "filesystem marked clean, but inode %llu has i_sectors dirty", u.bi_inum))) { s64 sectors; @@ -944,14 +953,14 @@ static int check_inode(struct btree_trans *trans, } u.bi_sectors = sectors; - u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + u.bi_flags &= ~BCH_INODE_i_sectors_dirty; do_update = true; } - if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { + if (u.bi_flags & BCH_INODE_backptr_untrusted) { u.bi_dir = 0; u.bi_dir_offset = 0; - u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; + u.bi_flags &= ~BCH_INODE_backptr_untrusted; do_update = true; } @@ -1056,10 +1065,11 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) return -BCH_ERR_internal_fsck_err; } - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, - "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, - i->inode.bi_sectors, i->count)) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), + c, inode_i_sectors_wrong, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->last_pos.inode, i->snapshot, + i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; ret = fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) @@ -1200,7 +1210,8 @@ static int overlapping_extents_found(struct btree_trans *trans, prt_printf(&buf, "\n overwriting %s extent", pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); - if (fsck_err(c, "overlapping extents%s", buf.buf)) { + if (fsck_err(c, extent_overlapping, + "overlapping extents%s", buf.buf)) { struct btree_iter *old_iter = &iter1; struct disk_reservation res = { 0 }; @@ -1297,6 +1308,28 @@ err: return ret; } +static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc_is_encoded(crc) && + crc.uncompressed_size > encoded_extent_max_sectors) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); + printbuf_exit(&buf); + } + + return 0; +} + static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, struct inode_walker *inode, @@ -1333,7 +1366,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (k.k->type != KEY_TYPE_whiteout) { - if (fsck_err_on(!i, c, + if (fsck_err_on(!i, c, extent_in_missing_inode, "extent in missing inode:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -1341,7 +1374,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(i && !S_ISREG(i->inode.bi_mode) && - !S_ISLNK(i->inode.bi_mode), c, + !S_ISLNK(i->inode.bi_mode), + c, extent_in_non_reg_inode, "extent in non regular inode mode %o:\n %s", i->inode.bi_mode, (printbuf_reset(&buf), @@ -1371,9 +1405,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, continue; if (k.k->type != KEY_TYPE_whiteout) { - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && - !bkey_extent_is_reservation(k), c, + !bkey_extent_is_reservation(k), + c, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n %s", i->inode.bi_inum, i->snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1432,7 +1467,8 @@ int bch2_check_extents(struct bch_fs *c) &res, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ bch2_disk_reservation_put(c, &res); - check_extent(trans, &iter, k, &w, &s, &extent_ends); + check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent_overbig(trans, &iter, k); })) ?: check_i_sectors(trans, &w); @@ -1446,6 +1482,30 @@ int bch2_check_extents(struct bch_fs *c) return ret; } +int bch2_check_indirect_extents(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct disk_reservation res = { 0 }; + int ret = 0; + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + POS_MIN, + BTREE_ITER_PREFETCH, k, + &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + bch2_disk_reservation_put(c, &res); + check_extent_overbig(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + bch2_trans_put(trans); + + bch_err_fn(c, ret); + return ret; +} + static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1470,7 +1530,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) continue; } - if (fsck_err_on(i->inode.bi_nlink != i->count, c, + if (fsck_err_on(i->inode.bi_nlink != i->count, + c, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; @@ -1514,27 +1575,28 @@ static int check_dirent_target(struct btree_trans *trans, backpointer_exists = ret; ret = 0; - if (fsck_err_on(S_ISDIR(target->bi_mode) && - backpointer_exists, c, + if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, + c, inode_dir_multiple_links, "directory %llu with multiple links", target->bi_inum)) { ret = __remove_dirent(trans, d.k->p); goto out; } - if (fsck_err_on(backpointer_exists && - !target->bi_nlink, c, + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, "inode %llu type %s has multiple links but i_nlink 0", target->bi_inum, bch2_d_types[d.v->d_type])) { target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_UNLINKED; + target->bi_flags &= ~BCH_INODE_unlinked; ret = __write_inode(trans, target, target_snapshot); if (ret) goto err; } - if (fsck_err_on(!backpointer_exists, c, + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, "inode %llu:%u has wrong backpointer:\n" "got %llu:%llu\n" "should be %llu:%llu", @@ -1552,7 +1614,8 @@ static int check_dirent_target(struct btree_trans *trans, } } - if (fsck_err_on(d.v->d_type != inode_d_type(target), c, + if (fsck_err_on(d.v->d_type != inode_d_type(target), + c, dirent_d_type_wrong, "incorrect d_type: got %s, should be %s:\n%s", bch2_d_type_str(d.v->d_type), bch2_d_type_str(inode_d_type(target)), @@ -1576,7 +1639,8 @@ static int check_dirent_target(struct btree_trans *trans, if (d.v->d_type == DT_SUBVOL && target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && (c->sb.version < bcachefs_metadata_version_subvol_dirent || - fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", + fsck_err(c, dirent_d_parent_subvol_wrong, + "dirent has wrong d_parent_subvol field: got %u, should be %u", le32_to_cpu(d.v->d_parent_subvol), target->bi_parent_subvol))) { n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); @@ -1648,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; - if (fsck_err_on(!i, c, + if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1660,7 +1724,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (!i) goto out; - if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), + c, dirent_in_non_dir_inode, "dirent in non directory inode type %s:\n%s", bch2_d_type_str(inode_d_type(&i->inode)), (printbuf_reset(&buf), @@ -1694,7 +1759,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - if (fsck_err_on(ret, c, + if (fsck_err_on(ret, c, dirent_to_missing_subvol, "dirent points to missing subvolume %u", le32_to_cpu(d.v->d_child_subvol))) { ret = __remove_dirent(trans, d.k->p); @@ -1706,7 +1771,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - if (fsck_err_on(ret, c, + if (fsck_err_on(ret, c, subvol_to_missing_root, "subvolume %u points to missing subvolume root %llu", target_subvol, target_inum)) { @@ -1715,7 +1780,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, + c, subvol_root_wrong_bi_subvol, "subvol root %llu has wrong bi_subvol field: got %u, should be %u", target_inum, subvol_root.bi_subvol, target_subvol)) { @@ -1734,7 +1800,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (fsck_err_on(!target->inodes.nr, c, + if (fsck_err_on(!target->inodes.nr, + c, dirent_to_missing_inode, "dirent points to missing inode: (equiv %u)\n%s", equiv.snapshot, (printbuf_reset(&buf), @@ -1820,7 +1887,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); inode->first_this_inode = false; - if (fsck_err_on(!i, c, + if (fsck_err_on(!i, c, xattr_in_missing_inode, "xattr for missing inode %llu", k.k->p.inode)) return bch2_btree_delete_at(trans, iter, 0); @@ -1869,7 +1936,8 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { + if (mustfix_fsck_err_on(ret, c, root_subvol_missing, + "root subvol missing")) { struct bkey_i_subvolume root_subvol; snapshot = U32_MAX; @@ -1895,8 +1963,10 @@ static int check_root_trans(struct btree_trans *trans) if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (mustfix_fsck_err_on(ret, c, "root directory missing") || - mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, + if (mustfix_fsck_err_on(ret, c, root_dir_missing, + "root directory missing") || + mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), + c, root_inode_not_dir, "root inode not a directory")) { bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); @@ -2000,7 +2070,8 @@ static int check_path(struct btree_trans *trans, } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + if (fsck_err(c, inode_unreachable, + "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", inode->bi_inum, snapshot, bch2_d_type_str(inode_d_type(inode)), inode->bi_nlink, @@ -2040,7 +2111,8 @@ static int check_path(struct btree_trans *trans, pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); - if (!fsck_err(c, "directory structure loop")) + if (!fsck_err(c, dir_loop, + "directory structure loop")) return 0; ret = commit_do(trans, NULL, NULL, @@ -2088,7 +2160,7 @@ int bch2_check_directory_structure(struct bch_fs *c) break; } - if (u.bi_flags & BCH_INODE_UNLINKED) + if (u.bi_flags & BCH_INODE_unlinked) continue; ret = check_path(trans, &path, &u, iter.pos.snapshot); @@ -2300,7 +2372,8 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite link = &links->d[++*idx]; } - if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, + if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, + c, inode_wrong_nlink, "inode %llu type %s has wrong i_nlink (%u, should be %u)", u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 90c87b5089a0..da991e8cf27e 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -4,6 +4,7 @@ int bch2_check_inodes(struct bch_fs *); int bch2_check_extents(struct bch_fs *); +int bch2_check_indirect_extents(struct bch_fs *); int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index bb3f443d8381..def77f2d8802 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -6,6 +6,7 @@ #include "bkey_methods.h" #include "btree_update.h" #include "buckets.h" +#include "compress.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -19,13 +20,18 @@ #include <asm/unaligned.h> -const char * const bch2_inode_opts[] = { #define x(name, ...) #name, +const char * const bch2_inode_opts[] = { BCH_INODE_OPTS() -#undef x NULL, }; +static const char * const bch2_inode_flag_strs[] = { + BCH_INODE_FLAGS() + NULL +}; +#undef x + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; static int inode_decode_field(const u8 *in, const u8 *end, @@ -361,9 +367,10 @@ int bch2_inode_peek(struct btree_trans *trans, return ret; } -int bch2_inode_write(struct btree_trans *trans, +int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + enum btree_update_flags flags) { struct bkey_inode_buf *inode_p; @@ -373,7 +380,7 @@ int bch2_inode_write(struct btree_trans *trans, bch2_inode_pack_inlined(inode_p, inode); inode_p->inode.k.p.snapshot = iter->snapshot; - return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) @@ -397,117 +404,121 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) return &inode_p->inode.k_i; } -static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) +static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bch_inode_unpacked unpacked; + int ret = 0; - if (k.k->p.inode) { - prt_printf(err, "nonzero k.p.inode"); - return -BCH_ERR_invalid_bkey; - } - - if (k.k->p.offset < BLOCKDEV_INODE_MAX) { - prt_printf(err, "fs inode in blockdev range"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->p.inode, c, err, + inode_pos_inode_nonzero, + "nonzero k.p.inode"); - if (bch2_inode_unpack(k, &unpacked)) { - prt_printf(err, "invalid variable length fields"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, + inode_pos_blockdev_range, + "fs inode in blockdev range"); - if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { - prt_printf(err, "invalid data checksum type (%u >= %u", - unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, + inode_unpack_error, + "invalid variable length fields"); - if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { - prt_printf(err, "invalid data checksum type (%u >= %u)", - unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, + inode_checksum_type_invalid, + "invalid data checksum type (%u >= %u", + unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && - unpacked.bi_nlink != 0) { - prt_printf(err, "flagged as unlinked but bi_nlink != 0"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(unpacked.bi_compression && + !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, + inode_compression_type_invalid, + "invalid compression opt %u", unpacked.bi_compression - 1); - if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { - prt_printf(err, "subvolume root but not a directory"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && + unpacked.bi_nlink != 0, c, err, + inode_unlinked_but_nlink_nonzero, + "flagged as unlinked but bi_nlink != 0"); - return 0; + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, + inode_subvol_root_but_not_dir, + "subvolume root but not a directory"); +fsck_err: + return ret; } -int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + int ret = 0; - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - prt_printf(err, "invalid str hash type (%llu >= %u)", - INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - return __bch2_inode_invalid(k, err); + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; } -int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + int ret = 0; - if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - prt_printf(err, "invalid str hash type (%llu >= %u)", - INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - return __bch2_inode_invalid(k, err); + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; } -int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + int ret = 0; - if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { - prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", - INODEv3_FIELDS_START(inode.v), - INODEv3_FIELDS_START_INITIAL, - bkey_val_u64s(inode.k)); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, + inode_v3_fields_start_bad, + "invalid fields_start (got %llu, min %u max %zu)", + INODEv3_FIELDS_START(inode.v), + INODEv3_FIELDS_START_INITIAL, + bkey_val_u64s(inode.k)); - if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { - prt_printf(err, "invalid str hash type (%llu >= %u)", - INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - return __bch2_inode_invalid(k, err); + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; } static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", - inode->bi_mode, inode->bi_flags, + prt_printf(out, "mode=%o ", inode->bi_mode); + + prt_str(out, "flags="); + prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); + prt_printf(out, " (%x)", inode->bi_flags); + + prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", inode->bi_journal_seq, inode->bi_size, inode->bi_sectors, inode->bi_version); #define x(_name, _bits) \ - prt_printf(out, " "#_name " %llu", (u64) inode->_name); + prt_printf(out, " "#_name "=%llu", (u64) inode->_name); BCH_INODE_FIELDS_v3() #undef x } @@ -546,7 +557,7 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k) static inline bool bkey_is_deleted_inode(struct bkey_s_c k) { - return bkey_inode_flags(k) & BCH_INODE_UNLINKED; + return bkey_inode_flags(k) & BCH_INODE_unlinked; } int bch2_trans_mark_inode(struct btree_trans *trans, @@ -610,16 +621,17 @@ int bch2_mark_inode(struct btree_trans *trans, return 0; } -int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (k.k->p.inode) { - prt_printf(err, "nonzero k.p.inode"); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(k.k->p.inode, c, err, + inode_pos_inode_nonzero, + "nonzero k.p.inode"); +fsck_err: + return ret; } void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, @@ -926,8 +938,8 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; + if (bi->bi_flags & BCH_INODE_unlinked) + bi->bi_flags &= ~BCH_INODE_unlinked; else { if (bi->bi_nlink == U32_MAX) return -EINVAL; @@ -940,13 +952,13 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) { - if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { + if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", bi->bi_inum); return; } - if (bi->bi_flags & BCH_INODE_UNLINKED) { + if (bi->bi_flags & BCH_INODE_unlinked) { bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); return; } @@ -954,7 +966,7 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked * if (bi->bi_nlink) bi->bi_nlink--; else - bi->bi_flags |= BCH_INODE_UNLINKED; + bi->bi_flags |= BCH_INODE_unlinked; } struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) @@ -979,6 +991,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; } +int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) +{ + struct bch_inode_unpacked inode; + int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); + + if (ret) + return ret; + + bch2_inode_opts_get(opts, trans->c, &inode); + return 0; +} + int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; @@ -1042,53 +1066,85 @@ err: return ret ?: -BCH_ERR_transaction_restart_nested; } -static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) +static int may_delete_deleted_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos, + bool *need_another_pass) { struct bch_fs *c = trans->c; - struct btree_iter iter; + struct btree_iter inode_iter; struct bkey_s_c k; struct bch_inode_unpacked inode; int ret; - if (bch2_snapshot_is_internal_node(c, pos.snapshot)) - return 0; - - if (!fsck_err_on(c->sb.clean, c, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) - return 0; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) return ret; ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; if (fsck_err_on(!bkey_is_inode(k.k), c, + deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; ret = bch2_inode_unpack(k, &inode); if (ret) - goto err; + goto out; if (fsck_err_on(S_ISDIR(inode.bi_mode), c, + deleted_inode_is_dir, "directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, + deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; - return 1; -err: + if (c->sb.clean && + !fsck_err(c, + deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) { + ret = 0; + goto out; + } + + if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { + struct bpos new_min_pos; + + ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); + if (ret) + goto out; + + inode.bi_flags &= ~BCH_INODE_unlinked; + + ret = bch2_inode_write_flags(trans, &inode_iter, &inode, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch_err_msg(c, ret, "clearing inode unlinked flag"); + if (ret) + goto out; + + /* + * We'll need another write buffer flush to pick up the new + * unlinked inodes in the snapshot leaves: + */ + *need_another_pass = true; + return 0; + } + + ret = 1; +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); return ret; delete: - return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + goto out; } int bch2_delete_dead_inodes(struct bch_fs *c) @@ -1096,7 +1152,10 @@ int bch2_delete_dead_inodes(struct bch_fs *c) struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; + bool need_another_pass; int ret; +again: + need_another_pass = false; ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) @@ -1110,7 +1169,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c) */ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p)); + ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p, + &need_another_pass)); if (ret < 0) break; @@ -1120,12 +1180,17 @@ int bch2_delete_dead_inodes(struct bch_fs *c) bch2_fs_lazy_rw(c); } + bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); + ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } } bch2_trans_iter_exit(trans, &iter); + + if (!ret && need_another_pass) + goto again; err: bch2_trans_put(trans); diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index a7464e1b6960..88818a332b1e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -3,16 +3,17 @@ #define _BCACHEFS_INODE_H #include "bkey.h" +#include "bkey_methods.h" #include "opts.h" enum bkey_invalid_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); -int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -52,7 +53,7 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v3; } -int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -101,8 +102,16 @@ void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *) int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); -int bch2_inode_write(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *); + +int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, enum btree_update_flags); + +static inline int bch2_inode_write(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode) +{ + return bch2_inode_write_flags(trans, iter, inode, 0); +} void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); @@ -177,7 +186,7 @@ static inline unsigned nlink_bias(umode_t mode) static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) { - return bi->bi_flags & BCH_INODE_UNLINKED + return bi->bi_flags & BCH_INODE_unlinked ? 0 : bi->bi_nlink + nlink_bias(bi->bi_mode); } @@ -187,10 +196,10 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, { if (nlink) { bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); - bi->bi_flags &= ~BCH_INODE_UNLINKED; + bi->bi_flags &= ~BCH_INODE_unlinked; } else { bi->bi_nlink = 0; - bi->bi_flags |= BCH_INODE_UNLINKED; + bi->bi_flags |= BCH_INODE_unlinked; } } @@ -200,6 +209,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); +int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 119834cb8f9e..bebc11444ef5 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -16,13 +16,14 @@ #include "io_misc.h" #include "io_write.h" #include "logged_ops.h" +#include "rebalance.h" #include "subvolume.h" /* Overwrites whatever was present with zeroes: */ int bch2_extent_fallocate(struct btree_trans *trans, subvol_inum inum, struct btree_iter *iter, - unsigned sectors, + u64 sectors, struct bch_io_opts opts, s64 *i_sectors_delta, struct write_point_specifier write_point) @@ -104,7 +105,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, if (ret) goto err; - sectors = min(sectors, wp->sectors_free); + sectors = min_t(u64, sectors, wp->sectors_free); sectors_allocated = sectors; bch2_key_resize(&e->k, sectors); @@ -355,6 +356,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, struct btree_iter iter; struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + struct bch_io_opts opts; u64 dst_offset = le64_to_cpu(op->v.dst_offset); u64 src_offset = le64_to_cpu(op->v.src_offset); s64 shift = dst_offset - src_offset; @@ -363,6 +365,10 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, bool insert = shift > 0; int ret = 0; + ret = bch2_inum_opts_get(trans, inum, &opts); + if (ret) + return ret; + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, 0), BTREE_ITER_INTENT); @@ -443,7 +449,10 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + ret = bch2_bkey_set_needs_rebalance(c, copy, + opts.background_target, + opts.background_compression) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h index c9e6ed40e1b8..9cb44a7c43c1 100644 --- a/fs/bcachefs/io_misc.h +++ b/fs/bcachefs/io_misc.h @@ -3,7 +3,7 @@ #define _BCACHEFS_IO_MISC_H int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, - unsigned, struct bch_io_opts, s64 *, + u64, struct bch_io_opts, s64 *, struct write_point_specifier); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, subvol_inum, u64, s64 *); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 443c3ea65527..a56ed553dc15 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -643,7 +643,7 @@ csum_err: "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); - bch2_io_error(ca); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; decompression_err: @@ -677,7 +677,7 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, rbio->read_pos.inode, rbio->read_pos.offset, "data read error: %s", @@ -1025,7 +1025,7 @@ get_bio: trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(c, rbio)) { + if (bch2_ec_read_extent(trans, rbio)) { bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 6e4f85eb6ec8..f02b3f7d26a0 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -202,6 +202,17 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, struct btree_iter iter; struct bkey_i *k; struct bkey_i_inode_v3 *inode; + /* + * Crazy performance optimization: + * Every extent update needs to also update the inode: the inode trigger + * will set bi->journal_seq to the journal sequence number of this + * transaction - for fsync. + * + * But if that's the only reason we're updating the inode (we're not + * updating bi_size or bi_sectors), then we don't need the inode update + * to be journalled - if we crash, the bi_journal_seq update will be + * lost, but that's fine. + */ unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; int ret; @@ -223,7 +234,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, inode = bkey_i_to_inode_v3(k); - if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && new_i_size > le64_to_cpu(inode->v.bi_size)) { inode->v.bi_size = cpu_to_le64(new_i_size); inode_update_flags = 0; @@ -351,10 +362,13 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(trans, inum, &iter, sk.k, - &op->res, - op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); + ret = bch2_bkey_set_needs_rebalance(c, sk.k, + op->opts.background_target, + op->opts.background_compression) ?: + bch2_extent_update(trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -495,7 +509,6 @@ static void __bch2_write_index(struct bch_write_op *op) { struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - struct bkey_i *k; unsigned dev; int ret = 0; @@ -505,14 +518,6 @@ static void __bch2_write_index(struct bch_write_op *op) goto err; } - /* - * probably not the ideal place to hook this in, but I don't - * particularly want to plumb io_opts all the way through the btree - * update stack right now - */ - for_each_keylist_key(keys, k) - bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); - if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); @@ -643,7 +648,7 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, op->pos.inode, wbio->inode_offset << 9, "data write error: %s", @@ -816,6 +821,7 @@ static enum prep_encoded_ret { /* Can we just write the entire extent as is? */ if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && op->crc.compressed_size <= wp->sectors_free && (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { @@ -1091,9 +1097,7 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, e = bkey_s_c_to_extent(k); extent_for_each_ptr_decode(e, p, entry) { - if (p.crc.csum_type || - crc_is_compressed(p.crc) || - p.has_ec) + if (crc_is_encoded(p.crc) || p.has_ec) return false; replicas += bch2_extent_ptr_durability(c, &p); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0e7a9ffa3671..5b5d69f2316b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1019,6 +1019,25 @@ err: return ret; } +int bch2_fs_journal_alloc(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) { + if (ca->journal.nr) + continue; + + int ret = bch2_dev_journal_alloc(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + return 0; +} + /* startup/shutdown: */ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 491133cc52f3..011711e99c8d 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -534,6 +534,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 6a3d6a374e9c..f4bc2cdbfdd7 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -140,7 +140,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, if (!dup->csum_good) goto replace; - fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", + fsck_err(c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries (seq %llu)", le64_to_cpu(j->seq)); i = dup; goto found; @@ -235,7 +236,7 @@ static void journal_entry_err_msg(struct printbuf *out, prt_str(out, ": "); } -#define journal_entry_err(c, version, jset, entry, msg, ...) \ +#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ ({ \ struct printbuf _buf = PRINTBUF; \ \ @@ -244,9 +245,10 @@ static void journal_entry_err_msg(struct printbuf *out, \ switch (flags & BKEY_INVALID_WRITE) { \ case READ: \ - mustfix_fsck_err(c, "%s", _buf.buf); \ + mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ case WRITE: \ + bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ if (bch2_fs_inconsistent(c)) { \ ret = -BCH_ERR_fsck_errors_not_fixed; \ @@ -259,8 +261,8 @@ static void journal_entry_err_msg(struct printbuf *out, true; \ }) -#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ - ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) +#define journal_entry_err_on(cond, ...) \ + ((cond) ? journal_entry_err(__VA_ARGS__) : false) #define FSCK_DELETED_KEY 5 @@ -277,7 +279,10 @@ static int journal_validate_key(struct bch_fs *c, struct printbuf buf = PRINTBUF; int ret = 0; - if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { + if (journal_entry_err_on(!k->k.u64s, + c, version, jset, entry, + journal_entry_bkey_u64s_0, + "k->u64s 0")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; @@ -286,6 +291,7 @@ static int journal_validate_key(struct bch_fs *c, if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), c, version, jset, entry, + journal_entry_bkey_past_end, "extends past end of journal entry")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); @@ -294,6 +300,7 @@ static int journal_validate_key(struct bch_fs *c, if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, version, jset, entry, + journal_entry_bkey_bad_format, "bad format %u", k->k.format)) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -317,7 +324,8 @@ static int journal_validate_key(struct bch_fs *c, bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf); - mustfix_fsck_err(c, "%s", buf.buf); + mustfix_fsck_err(c, journal_entry_bkey_invalid, + "%s", buf.buf); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -369,7 +377,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs prt_newline(out); prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); } - prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } @@ -387,6 +395,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, version, jset, entry, + journal_entry_btree_root_bad_size, "invalid btree root journal entry: wrong number of keys")) { void *next = vstruct_next(entry); /* @@ -436,6 +445,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, version, jset, entry, + journal_entry_blacklist_bad_size, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -463,6 +473,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, version, jset, entry, + journal_entry_blacklist_v2_bad_size, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); goto out; @@ -473,6 +484,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > le64_to_cpu(bl_entry->end), c, version, jset, entry, + journal_entry_blacklist_v2_start_past_end, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -505,6 +517,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u), c, version, jset, entry, + journal_entry_usage_bad_size, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -539,6 +552,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, c, version, jset, entry, + journal_entry_data_usage_bad_size, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -570,13 +584,17 @@ static int journal_entry_clock_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes != sizeof(*clock), - c, version, jset, entry, "bad size")) { + c, version, jset, entry, + journal_entry_clock_bad_size, + "bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(clock->rw > 1, - c, version, jset, entry, "bad rw")) { + c, version, jset, entry, + journal_entry_clock_bad_rw, + "bad rw")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -608,7 +626,9 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < expected, - c, version, jset, entry, "bad size (%u < %u)", + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", bytes, expected)) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -617,13 +637,17 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, dev = le32_to_cpu(u->dev); if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, version, jset, entry, "bad dev")) { + c, version, jset, entry, + journal_entry_dev_usage_bad_dev, + "bad dev")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(u->pad, - c, version, jset, entry, "bad pad")) { + c, version, jset, entry, + journal_entry_dev_usage_bad_pad, + "bad pad")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -738,7 +762,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, vstruct_for_each(jset, entry) { if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), - c, version, jset, entry, + c, version, jset, entry, + journal_entry_past_jset_end, "journal entry extends past end of jset")) { jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; @@ -767,6 +792,7 @@ static int jset_validate(struct bch_fs *c, version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, + jset_unsupported_version, "%s sector %llu seq %llu: incompatible journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -777,7 +803,8 @@ static int jset_validate(struct bch_fs *c, } if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), - c, version, jset, NULL, + c, version, jset, NULL, + jset_unknown_csum, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -788,6 +815,7 @@ static int jset_validate(struct bch_fs *c, if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, version, jset, NULL, + jset_last_seq_newer_than_seq, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(jset->last_seq), le64_to_cpu(jset->seq))) { @@ -816,7 +844,8 @@ static int jset_validate_early(struct bch_fs *c, version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), - c, version, jset, NULL, + c, version, jset, NULL, + jset_unsupported_version, "%s sector %llu seq %llu: unknown journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -831,7 +860,8 @@ static int jset_validate_early(struct bch_fs *c, return JOURNAL_ENTRY_REREAD; if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, version, jset, NULL, + c, version, jset, NULL, + jset_past_bucket_end, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), bytes)) @@ -900,7 +930,7 @@ reread: ret = submit_bio_wait(bio); kfree(bio); - if (bch2_dev_io_err_on(ret, ca, + if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, "journal read error: sector %llu", offset) || bch2_meta_read_fault("journal")) { @@ -956,7 +986,8 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); csum_good = jset_csum_good(c, j); - if (!csum_good) + if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, + "journal checksum error")) saw_bad = true; ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), @@ -1172,6 +1203,7 @@ int bch2_journal_read(struct bch_fs *c, if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), c, le32_to_cpu(i->j.version), &i->j, NULL, + jset_last_seq_newer_than_seq, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(i->j.last_seq), le64_to_cpu(i->j.seq))) @@ -1188,7 +1220,8 @@ int bch2_journal_read(struct bch_fs *c, } if (!*last_seq) { - fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); + fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, + "journal read done, but no entries found after dropping non-flushes"); return 0; } @@ -1214,6 +1247,7 @@ int bch2_journal_read(struct bch_fs *c, if (bch2_journal_seq_is_blacklisted(c, seq, true)) { fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + jset_seq_blacklisted, "found blacklisted journal entry %llu", seq); i->ignore = true; } @@ -1254,7 +1288,8 @@ int bch2_journal_read(struct bch_fs *c, bch2_journal_ptrs_to_text(&buf2, c, i); missing_end = seq - 1; - fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" + fsck_err(c, journal_entries_missing, + "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" " prev at %s\n" " next at %s", missing_start, missing_end, @@ -1309,7 +1344,8 @@ int bch2_journal_read(struct bch_fs *c, if (!degraded && !bch2_replicas_marked(c, &replicas.e) && (le64_to_cpu(i->j.seq) == *last_seq || - fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s", + fsck_err(c, journal_entry_replicas_not_marked, + "superblock not marked as containing replicas for journal entry %llu\n %s", le64_to_cpu(i->j.seq), buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) @@ -1581,7 +1617,8 @@ static void journal_write_endio(struct bio *bio) struct journal_buf *w = journal_last_unwritten_buf(j); unsigned long flags; - if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { @@ -1641,9 +1678,15 @@ static void do_journal_write(struct closure *cl) continue_at(cl, journal_write_done, c->io_complete_wq); } -static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) +static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) { - struct jset_entry *i, *next, *prev = NULL; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset_entry *start, *end, *i, *next, *prev = NULL; + struct jset *jset = w->data; + unsigned sectors, bytes, u64s; + bool validate_before_checksum = false; + unsigned long btree_roots_have = 0; + int ret; /* * Simple compaction, dropping empty jset_entries (from journal @@ -1660,8 +1703,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset if (!u64s) continue; - if (i->type == BCH_JSET_ENTRY_btree_root) + /* + * New btree roots are set by journalling them; when the journal + * entry gets written we have to propagate them to + * c->btree_roots + * + * But, every journal entry we write has to contain all the + * btree roots (at least for now); so after we copy btree roots + * to c->btree_roots we have to get any missing btree roots and + * add them to this journal entry: + */ + if (i->type == BCH_JSET_ENTRY_btree_root) { bch2_journal_entry_to_btree_root(c, i); + __set_bit(i->btree_id, &btree_roots_have); + } /* Can we merge with previous entry? */ if (prev && @@ -1685,85 +1740,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset prev = prev ? vstruct_next(prev) : jset->start; jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - -void bch2_journal_write(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_replicas_padded replicas; - struct jset_entry *start, *end; - struct jset *jset; - struct bio *bio; - struct printbuf journal_debug_buf = PRINTBUF; - bool validate_before_checksum = false; - unsigned i, sectors, bytes, u64s, nr_rw_members = 0; - int ret; - - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); - - journal_buf_realloc(j, w); - jset = w->data; - - j->write_start_time = local_clock(); - - spin_lock(&j->lock); - - /* - * If the journal is in an error state - we did an emergency shutdown - - * we prefer to continue doing journal writes. We just mark them as - * noflush so they'll never be used, but they'll still be visible by the - * list_journal tool - this helps in debugging. - * - * There's a caveat: the first journal write after marking the - * superblock dirty must always be a flush write, because on startup - * from a clean shutdown we didn't necessarily read the journal and the - * new journal write might overwrite whatever was in the journal - * previously - we can't leave the journal without any flush writes in - * it. - * - * So if we're in an error state, and we're still starting up, we don't - * write anything at all. - */ - if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && - (bch2_journal_error(j) || - w->noflush || - (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { - w->noflush = true; - SET_JSET_NO_FLUSH(jset, true); - jset->last_seq = 0; - w->last_seq = 0; - - j->nr_noflush_writes++; - } else if (!bch2_journal_error(j)) { - j->last_flush_write = jiffies; - j->nr_flush_writes++; - clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); - } else { - spin_unlock(&j->lock); - goto err; - } - spin_unlock(&j->lock); - - /* - * New btree roots are set by journalling them; when the journal entry - * gets written we have to propagate them to c->btree_roots - * - * But, every journal entry we write has to contain all the btree roots - * (at least for now); so after we copy btree roots to c->btree_roots we - * have to get any missing btree roots and add them to this journal - * entry: - */ - - bch2_journal_entries_postprocess(c, jset); start = end = vstruct_last(jset); - end = bch2_btree_roots_to_journal_entries(c, jset->start, end); + end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); bch2_journal_super_entries_add_common(c, &end, le64_to_cpu(jset->seq)); @@ -1779,7 +1759,7 @@ void bch2_journal_write(struct closure *cl) bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", vstruct_bytes(jset), w->sectors << 9, u64s, w->u64s_reserved, j->entry_u64s_reserved); - goto err; + return -EINVAL; } jset->magic = cpu_to_le64(jset_magic(c)); @@ -1798,37 +1778,117 @@ void bch2_journal_write(struct closure *cl) validate_before_checksum = true; if (validate_before_checksum && - jset_validate(c, NULL, jset, 0, WRITE)) - goto err; + (ret = jset_validate(c, NULL, jset, 0, WRITE))) + return ret; ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); if (bch2_fs_fatal_err_on(ret, c, "error decrypting journal entry: %i", ret)) - goto err; + return ret; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate(c, NULL, jset, 0, WRITE)) - goto err; + (ret = jset_validate(c, NULL, jset, 0, WRITE))) + return ret; memset((void *) jset + bytes, 0, (sectors << 9) - bytes); + return 0; +} + +static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int error = bch2_journal_error(j); + + /* + * If the journal is in an error state - we did an emergency shutdown - + * we prefer to continue doing journal writes. We just mark them as + * noflush so they'll never be used, but they'll still be visible by the + * list_journal tool - this helps in debugging. + * + * There's a caveat: the first journal write after marking the + * superblock dirty must always be a flush write, because on startup + * from a clean shutdown we didn't necessarily read the journal and the + * new journal write might overwrite whatever was in the journal + * previously - we can't leave the journal without any flush writes in + * it. + * + * So if we're in an error state, and we're still starting up, we don't + * write anything at all. + */ + if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) + return -EIO; + + if (error || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + w->noflush = true; + SET_JSET_NO_FLUSH(w->data, true); + w->data->last_seq = 0; + w->last_seq = 0; + + j->nr_noflush_writes++; + } else { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } + + return 0; +} + +void bch2_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; + struct bio *bio; + struct printbuf journal_debug_buf = PRINTBUF; + unsigned i, nr_rw_members = 0; + int ret; + + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + j->write_start_time = local_clock(); -retry_alloc: spin_lock(&j->lock); - ret = journal_write_alloc(j, w); + ret = bch2_journal_write_pick_flush(j, w); + spin_unlock(&j->lock); + if (ret) + goto err; + + journal_buf_realloc(j, w); + + ret = bch2_journal_write_prep(j, w); + if (ret) + goto err; + + while (1) { + spin_lock(&j->lock); + ret = journal_write_alloc(j, w); + if (!ret || !j->can_discard) + break; - if (ret && j->can_discard) { spin_unlock(&j->lock); bch2_journal_do_discards(j); - goto retry_alloc; } - if (ret) + if (ret) { __bch2_journal_debug_to_text(&journal_debug_buf, j); + spin_unlock(&j->lock); + bch_err(c, "Unable to allocate journal write:\n%s", + journal_debug_buf.buf); + printbuf_exit(&journal_debug_buf); + goto err; + } /* * write is allocated, no longer need to account for it in @@ -1843,13 +1903,6 @@ retry_alloc: bch2_journal_space_available(j); spin_unlock(&j->lock); - if (ret) { - bch_err(c, "Unable to allocate journal write:\n%s", - journal_debug_buf.buf); - printbuf_exit(&journal_debug_buf); - goto err; - } - w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); if (c->opts.nochanges) @@ -1871,7 +1924,7 @@ retry_alloc: if (ret) goto err; - if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { for_each_rw_member(ca, c, i) { percpu_ref_get(&ca->io_ref); diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 215a653322f3..a5cc0ed195d6 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -10,17 +10,17 @@ #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ -int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (!lru_pos_time(k.k->p)) { - prt_printf(err, "lru entry at time=0"); - return -BCH_ERR_invalid_bkey; - - } + int ret = 0; - return 0; + bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, + lru_entry_at_time_0, + "lru entry at time=0"); +fsck_err: + return ret; } void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, @@ -95,6 +95,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, int ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, + lru_entry_to_invalid_bucket, "lru key points to nonexistent device:bucket %llu:%llu", alloc_pos.inode, alloc_pos.offset)) return bch2_btree_delete_at(trans, lru_iter, 0); @@ -125,7 +126,8 @@ static int bch2_check_lru_key(struct btree_trans *trans, } if (c->opts.reconstruct_alloc || - fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + fsck_err(c, lru_entry_bad, + "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", bch2_lru_types[type], diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index be66bf9ad809..429dca816df5 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -48,7 +48,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 39a14e321680..ab749bf2fcbc 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -20,6 +20,7 @@ #include "keylist.h" #include "move.h" #include "replicas.h" +#include "snapshot.h" #include "super-io.h" #include "trace.h" @@ -59,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c } } -static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_add(&stats->list, &c->data_progress_list); - mutex_unlock(&c->data_progress_lock); -} - -static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) -{ - mutex_lock(&c->data_progress_lock); - list_del(&stats->list); - mutex_unlock(&c->data_progress_lock); -} - struct moving_io { struct list_head read_list; struct list_head io_list; @@ -156,35 +143,31 @@ static void move_read_endio(struct bio *bio) closure_put(&ctxt->cl); } -void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, - struct btree_trans *trans) +void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) { struct moving_io *io; - if (trans) - bch2_trans_unlock(trans); - while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { + bch2_trans_unlock_long(ctxt->trans); list_del(&io->read_list); move_write(io); } } -static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, - struct btree_trans *trans) +void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) { unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - move_ctxt_wait_event(ctxt, trans, + move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->write_sectors) || atomic_read(&ctxt->write_sectors) != sectors_pending); } void bch2_moving_ctxt_exit(struct moving_context *ctxt) { - struct bch_fs *c = ctxt->c; + struct bch_fs *c = ctxt->trans->c; - move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); EBUG_ON(atomic_read(&ctxt->write_sectors)); @@ -192,16 +175,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) EBUG_ON(atomic_read(&ctxt->read_sectors)); EBUG_ON(atomic_read(&ctxt->read_ios)); - if (ctxt->stats) { - progress_list_del(c, ctxt->stats); - trace_move_data(c, - atomic64_read(&ctxt->stats->sectors_moved), - atomic64_read(&ctxt->stats->keys_moved)); - } - mutex_lock(&c->moving_context_lock); list_del(&ctxt->list); mutex_unlock(&c->moving_context_lock); + + bch2_trans_put(ctxt->trans); + memset(ctxt, 0, sizeof(*ctxt)); } void bch2_moving_ctxt_init(struct moving_context *ctxt, @@ -213,7 +192,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, { memset(ctxt, 0, sizeof(*ctxt)); - ctxt->c = c; + ctxt->trans = bch2_trans_get(c); ctxt->fn = (void *) _RET_IP_; ctxt->rate = rate; ctxt->stats = stats; @@ -230,16 +209,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, mutex_lock(&c->moving_context_lock); list_add(&ctxt->list, &c->moving_context_list); mutex_unlock(&c->moving_context_lock); +} - if (stats) { - progress_list_add(c, stats); - stats->data_type = BCH_DATA_user; - } +void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) +{ + trace_move_data(c, stats); } void bch2_move_stats_init(struct bch_move_stats *stats, char *name) { memset(stats, 0, sizeof(*stats)); + stats->data_type = BCH_DATA_user; scnprintf(stats->name, sizeof(stats->name), "%s", name); } @@ -286,15 +266,14 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); } -static int bch2_move_extent(struct btree_trans *trans, - struct btree_iter *iter, - struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bch_io_opts io_opts, - enum btree_id btree_id, - struct bkey_s_c k, - struct data_update_opts data_opts) +int bch2_move_extent(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_io_opts io_opts, + struct data_update_opts data_opts) { + struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct moving_io *io; @@ -303,6 +282,8 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + if (ctxt->stats) + ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); trace_move_extent2(c, k); bch2_data_update_opts_normalize(k, &data_opts); @@ -355,7 +336,7 @@ static int bch2_move_extent(struct btree_trans *trans, io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, - io_opts, data_opts, btree_id, k); + io_opts, data_opts, iter->btree_id, k); if (ret && ret != -BCH_ERR_unwritten_extent_update) goto err_free_pages; @@ -367,9 +348,11 @@ static int bch2_move_extent(struct btree_trans *trans, BUG_ON(ret); - io->write.ctxt = ctxt; io->write.op.end_io = move_write_done; + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); + if (ctxt->stats) { atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); @@ -399,7 +382,7 @@ static int bch2_move_extent(struct btree_trans *trans, closure_get(&ctxt->cl); bch2_read_extent(trans, &io->rbio, bkey_start_pos(k.k), - btree_id, k, 0, + iter->btree_id, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; @@ -413,45 +396,96 @@ err: return ret; } -static int lookup_inode(struct btree_trans *trans, struct bpos pos, - struct bch_inode_unpacked *inode) +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *io_opts, + struct bkey_s_c extent_k) +{ + struct bch_fs *c = trans->c; + u32 restart_count = trans->restart_count; + int ret = 0; + + if (io_opts->cur_inum != extent_k.k->p.inode) { + struct btree_iter iter; + struct bkey_s_c k; + + io_opts->d.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != extent_k.k->p.inode) + break; + + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + BUG_ON(bch2_inode_unpack(k, &inode)); + + struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; + bch2_inode_opts_get(&e.io_opts, trans->c, &inode); + + ret = darray_push(&io_opts->d, e); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + io_opts->cur_inum = extent_k.k->p.inode; + } + + ret = ret ?: trans_was_restarted(trans, restart_count); + if (ret) + return ERR_PTR(ret); + + if (extent_k.k->p.snapshot) { + struct snapshot_io_opts_entry *i; + darray_for_each(io_opts->d, i) + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) + return &i->io_opts; + } + + return &io_opts->fs_io_opts; +} + +int bch2_move_get_io_opts_one(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c extent_k) { struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(&iter); + /* reflink btree? */ + if (!extent_k.k->p.inode) { + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + return 0; + } + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), + BTREE_ITER_CACHED); ret = bkey_err(k); - if (ret) - goto err; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; - if (!k.k || !bkey_eq(k.k->p, pos)) { - ret = -BCH_ERR_ENOENT_inode; - goto err; + if (!ret && bkey_is_inode(k.k)) { + struct bch_inode_unpacked inode; + bch2_inode_unpack(k, &inode); + bch2_inode_opts_get(io_opts, trans->c, &inode); + } else { + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); } - ret = bkey_is_inode(k.k) ? 0 : -EIO; - if (ret) - goto err; - - ret = bch2_inode_unpack(k, inode); - if (ret) - goto err; -err: bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } -static int move_ratelimit(struct btree_trans *trans, - struct moving_context *ctxt) +int bch2_move_ratelimit(struct moving_context *ctxt) { - struct bch_fs *c = trans->c; + struct bch_fs *c = ctxt->trans->c; u64 delay; - if (ctxt->wait_on_copygc) { - bch2_trans_unlock(trans); + if (ctxt->wait_on_copygc && !c->copygc_running) { + bch2_trans_unlock_long(ctxt->trans); wait_event_killable(c->copygc_running_wq, !c->copygc_running || kthread_should_stop()); @@ -460,8 +494,12 @@ static int move_ratelimit(struct btree_trans *trans, do { delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; + if (delay) { - bch2_trans_unlock(trans); + if (delay > HZ / 10) + bch2_trans_unlock_long(ctxt->trans); + else + bch2_trans_unlock(ctxt->trans); set_current_state(TASK_INTERRUPTIBLE); } @@ -474,7 +512,7 @@ static int move_ratelimit(struct btree_trans *trans, schedule_timeout(delay); if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); @@ -483,7 +521,7 @@ static int move_ratelimit(struct btree_trans *trans, * XXX: these limits really ought to be per device, SSDs and hard drives * will want different limits */ - move_ctxt_wait_event(ctxt, trans, + move_ctxt_wait_event(ctxt, atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && @@ -492,52 +530,28 @@ static int move_ratelimit(struct btree_trans *trans, return 0; } -static int move_get_io_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct bkey_s_c k, u64 *cur_inum) -{ - struct bch_inode_unpacked inode; - int ret; - - if (*cur_inum == k.k->p.inode) - return 0; - - ret = lookup_inode(trans, - SPOS(0, k.k->p.inode, k.k->p.snapshot), - &inode); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - - if (!ret) - bch2_inode_opts_get(io_opts, trans->c, &inode); - else - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); - *cur_inum = k.k->p.inode; - return 0; -} - -static int __bch2_move_data(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id) +static int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id) { - struct bch_fs *c = ctxt->c; - struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct per_snapshot_io_opts snapshot_io_opts; + struct bch_io_opts *io_opts; struct bkey_buf sk; - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct data_update_opts data_opts; - u64 cur_inum = U64_MAX; int ret = 0, ret2; + per_snapshot_io_opts_init(&snapshot_io_opts, c); bch2_bkey_buf_init(&sk); if (ctxt->stats) { ctxt->stats->data_type = BCH_DATA_user; - ctxt->stats->btree_id = btree_id; - ctxt->stats->pos = start; + ctxt->stats->pos = BBPOS(btree_id, start); } bch2_trans_iter_init(trans, &iter, btree_id, start, @@ -547,7 +561,7 @@ static int __bch2_move_data(struct moving_context *ctxt, if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); - while (!move_ratelimit(trans, ctxt)) { + while (!bch2_move_ratelimit(ctxt)) { bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -564,17 +578,18 @@ static int __bch2_move_data(struct moving_context *ctxt, break; if (ctxt->stats) - ctxt->stats->pos = iter.pos; + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); + ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, k, &io_opts, &data_opts)) + if (!pred(c, arg, k, io_opts, &data_opts)) goto next; /* @@ -584,24 +599,20 @@ static int __bch2_move_data(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(trans, &iter, ctxt, NULL, - io_opts, btree_id, k, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, trans); + bch2_move_ctxt_wait_for_io(ctxt); continue; } /* XXX signal failure */ goto next; } - - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); next: if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); @@ -610,59 +621,68 @@ next_nondata: } bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); + per_snapshot_io_opts_exit(&snapshot_io_opts); return ret; } -int bch2_move_data(struct bch_fs *c, - enum btree_id start_btree_id, struct bpos start_pos, - enum btree_id end_btree_id, struct bpos end_pos, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) +int __bch2_move_data(struct moving_context *ctxt, + struct bbpos start, + struct bbpos end, + move_pred_fn pred, void *arg) { - struct moving_context ctxt; + struct bch_fs *c = ctxt->trans->c; enum btree_id id; int ret = 0; - bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - - for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + for (id = start.btree; + id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); id++) { - stats->btree_id = id; + ctxt->stats->pos = BBPOS(id, POS_MIN); - if (id != BTREE_ID_extents && - id != BTREE_ID_reflink) + if (!btree_type_has_ptrs(id) || + !bch2_btree_id_root(c, id)->b) continue; - if (!bch2_btree_id_root(c, id)->b) - continue; - - ret = __bch2_move_data(&ctxt, - id == start_btree_id ? start_pos : POS_MIN, - id == end_btree_id ? end_pos : POS_MAX, + ret = bch2_move_data_btree(ctxt, + id == start.btree ? start.pos : POS_MIN, + id == end.btree ? end.pos : POS_MAX, pred, arg, id); if (ret) break; } + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bbpos start, + struct bbpos end, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + + struct moving_context ctxt; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -int __bch2_evacuate_bucket(struct btree_trans *trans, - struct moving_context *ctxt, +int __bch2_evacuate_bucket(struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, struct bpos bucket, int gen, struct data_update_opts _data_opts) { - struct bch_fs *c = ctxt->c; + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_iter iter; struct bkey_buf sk; @@ -673,7 +693,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, struct data_update_opts data_opts; unsigned dirty_sectors, bucket_size; u64 fragmentation; - u64 cur_inum = U64_MAX; struct bpos bp_pos = POS_MIN; int ret = 0; @@ -708,7 +727,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, goto err; } - while (!(ret = move_ratelimit(trans, ctxt))) { + while (!(ret = bch2_move_ratelimit(ctxt))) { bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, @@ -737,7 +756,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); + ret = bch2_move_get_io_opts_one(trans, &io_opts, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; @@ -758,23 +777,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, i++; } - ret = bch2_move_extent(trans, &iter, ctxt, - bucket_in_flight, - io_opts, bp.btree_id, k, data_opts); + ret = bch2_move_extent(ctxt, bucket_in_flight, + &iter, k, io_opts, data_opts); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, trans); + bch2_move_ctxt_wait_for_io(ctxt); continue; } if (ret) goto err; - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); } else { @@ -825,14 +841,12 @@ int bch2_evacuate_bucket(struct bch_fs *c, struct write_point_specifier wp, bool wait_on_copygc) { - struct btree_trans *trans = bch2_trans_get(c); struct moving_context ctxt; int ret; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts); + ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts); bch2_moving_ctxt_exit(&ctxt); - bch2_trans_put(trans); return ret; } @@ -849,21 +863,25 @@ static int bch2_move_btree(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_trans *trans = bch2_trans_get(c); + struct moving_context ctxt; + struct btree_trans *trans; struct btree_iter iter; struct btree *b; enum btree_id id; struct data_update_opts data_opts; int ret = 0; - progress_list_add(c, stats); + bch2_moving_ctxt_init(&ctxt, c, NULL, stats, + writepoint_ptr(&c->btree_write_point), + true); + trans = ctxt.trans; stats->data_type = BCH_DATA_btree; for (id = start_btree_id; id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); id++) { - stats->btree_id = id; + stats->pos = BBPOS(id, POS_MIN); if (!bch2_btree_id_root(c, id)->b) continue; @@ -882,7 +900,7 @@ retry: bpos_cmp(b->key.k.p, end_pos)) > 0) break; - stats->pos = iter.pos; + stats->pos = BBPOS(iter.btree_id, iter.pos); if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; @@ -904,14 +922,10 @@ next: break; } - bch2_trans_put(trans); - - if (ret) - bch_err_fn(c, ret); - + bch_err_fn(c, ret); + bch2_moving_ctxt_exit(&ctxt); bch2_btree_interior_updates_flush(c); - progress_list_del(c, stats); return ret; } @@ -1032,8 +1046,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) mutex_unlock(&c->sb_lock); } - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -1056,14 +1069,16 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + (struct bbpos) { op.start_btree, op.start_pos }, + (struct bbpos) { op.end_btree, op.end_pos }, NULL, stats, writepoint_hashed((unsigned long) current), true, rereplicate_pred, c) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; + + bch2_move_stats_exit(stats, c); break; case BCH_DATA_OP_MIGRATE: if (op.migrate.dev >= c->sb.nr_devices) @@ -1080,18 +1095,21 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, - op.start_btree, op.start_pos, - op.end_btree, op.end_pos, + (struct bbpos) { op.start_btree, op.start_pos }, + (struct bbpos) { op.end_btree, op.end_pos }, NULL, stats, writepoint_hashed((unsigned long) current), true, migrate_pred, &op) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; + + bch2_move_stats_exit(stats, c); break; case BCH_DATA_OP_REWRITE_OLD_NODES: bch2_move_stats_init(stats, "rewrite_old_nodes"); ret = bch2_scan_old_btree_nodes(c, stats); + bch2_move_stats_exit(stats, c); break; default: ret = -EINVAL; @@ -1100,19 +1118,43 @@ int bch2_data_job(struct bch_fs *c, return ret; } -static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) +void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) { - struct bch_move_stats *stats = ctxt->stats; - struct moving_io *io; + prt_printf(out, "%s: data type=%s pos=", + stats->name, + bch2_data_types[stats->data_type]); + bch2_bbpos_to_text(out, stats->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "keys moved: "); + prt_u64(out, atomic64_read(&stats->keys_moved)); + prt_newline(out); + + prt_str(out, "keys raced: "); + prt_u64(out, atomic64_read(&stats->keys_raced)); + prt_newline(out); + + prt_str(out, "bytes seen: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); + prt_newline(out); - prt_printf(out, "%s (%ps):", stats->name, ctxt->fn); + prt_str(out, "bytes moved: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); prt_newline(out); - prt_printf(out, " data type %s btree_id %s position: ", - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); + prt_str(out, "bytes raced: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) +{ + struct moving_io *io; + + bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); prt_printf(out, "reads: ios %u/%u sectors %u/%u", @@ -1153,7 +1195,4 @@ void bch2_fs_move_init(struct bch_fs *c) { INIT_LIST_HEAD(&c->moving_context_list); mutex_init(&c->moving_context_lock); - - INIT_LIST_HEAD(&c->data_progress_list); - mutex_init(&c->data_progress_lock); } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index cbdd58db8782..07cf9d42643b 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MOVE_H #define _BCACHEFS_MOVE_H +#include "bbpos.h" #include "bcachefs_ioctl.h" #include "btree_iter.h" #include "buckets.h" @@ -11,7 +12,7 @@ struct bch_read_bio; struct moving_context { - struct bch_fs *c; + struct btree_trans *trans; struct list_head list; void *fn; @@ -37,13 +38,14 @@ struct moving_context { wait_queue_head_t wait; }; -#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +#define move_ctxt_wait_event(_ctxt, _cond) \ do { \ bool cond_finished = false; \ - bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ + bch2_moving_ctxt_do_pending_writes(_ctxt); \ \ if (_cond) \ break; \ + bch2_trans_unlock_long((_ctxt)->trans); \ __wait_event((_ctxt)->wait, \ bch2_moving_ctxt_next_pending_write(_ctxt) || \ (cond_finished = (_cond))); \ @@ -59,22 +61,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool); struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); -void bch2_moving_ctxt_do_pending_writes(struct moving_context *, - struct btree_trans *); +void bch2_moving_ctxt_do_pending_writes(struct moving_context *); +void bch2_move_ctxt_wait_for_io(struct moving_context *); +int bch2_move_ratelimit(struct moving_context *); + +/* Inodes in different snapshots may have different IO options: */ +struct snapshot_io_opts_entry { + u32 snapshot; + struct bch_io_opts io_opts; +}; + +struct per_snapshot_io_opts { + u64 cur_inum; + struct bch_io_opts fs_io_opts; + DARRAY(struct snapshot_io_opts_entry) d; +}; + +static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) +{ + memset(io_opts, 0, sizeof(*io_opts)); + io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); +} + +static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) +{ + darray_exit(&io_opts->d); +} + +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bkey_s_c); +int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); +int bch2_move_extent(struct moving_context *, + struct move_bucket_in_flight *, + struct btree_iter *, + struct bkey_s_c, + struct bch_io_opts, + struct data_update_opts); + +int __bch2_move_data(struct moving_context *, + struct bbpos, + struct bbpos, + move_pred_fn, void *); int bch2_move_data(struct bch_fs *, - enum btree_id, struct bpos, - enum btree_id, struct bpos, + struct bbpos start, + struct bbpos end, struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool, move_pred_fn, void *); -int __bch2_evacuate_bucket(struct btree_trans *, - struct moving_context *, +int __bch2_evacuate_bucket(struct moving_context *, struct move_bucket_in_flight *, struct bpos, int, struct data_update_opts); @@ -88,7 +128,10 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); -void bch2_move_stats_init(struct bch_move_stats *stats, char *name); +void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); +void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); +void bch2_move_stats_init(struct bch_move_stats *, char *); + void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_move_init(struct bch_fs *); diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index baf1f8570b3f..e22841ef31e4 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -2,17 +2,17 @@ #ifndef _BCACHEFS_MOVE_TYPES_H #define _BCACHEFS_MOVE_TYPES_H +#include "bbpos_types.h" + struct bch_move_stats { enum bch_data_type data_type; - enum btree_id btree_id; - struct bpos pos; - struct list_head list; + struct bbpos pos; char name[32]; atomic64_t keys_moved; atomic64_t keys_raced; - atomic64_t sectors_moved; atomic64_t sectors_seen; + atomic64_t sectors_moved; atomic64_t sectors_raced; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 4017120baeee..0a0576326c5b 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -101,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, return ret; } -static void move_buckets_wait(struct btree_trans *trans, - struct moving_context *ctxt, +static void move_buckets_wait(struct moving_context *ctxt, struct buckets_in_flight *list, bool flush) { @@ -111,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans, while ((i = list->first)) { if (flush) - move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); + move_ctxt_wait_event(ctxt, !atomic_read(&i->count)); if (atomic_read(&i->count)) break; @@ -129,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans, kfree(i); } - bch2_trans_unlock(trans); + bch2_trans_unlock_long(ctxt->trans); } static bool bucket_in_flight(struct buckets_in_flight *list, @@ -140,11 +139,11 @@ static bool bucket_in_flight(struct buckets_in_flight *list, typedef DARRAY(struct move_bucket) move_buckets; -static int bch2_copygc_get_buckets(struct btree_trans *trans, - struct moving_context *ctxt, +static int bch2_copygc_get_buckets(struct moving_context *ctxt, struct buckets_in_flight *buckets_in_flight, move_buckets *buckets) { + struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; @@ -152,7 +151,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; int ret; - move_buckets_wait(trans, ctxt, buckets_in_flight, false); + move_buckets_wait(ctxt, buckets_in_flight, false); ret = bch2_btree_write_buffer_flush(trans); if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", @@ -188,10 +187,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans, } noinline -static int bch2_copygc(struct btree_trans *trans, - struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight) +static int bch2_copygc(struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight, + bool *did_work) { + struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct data_update_opts data_opts = { .btree_insert_flags = BCH_WATERMARK_copygc, @@ -202,7 +202,7 @@ static int bch2_copygc(struct btree_trans *trans, u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); if (ret) goto err; @@ -221,10 +221,12 @@ static int bch2_copygc(struct btree_trans *trans, break; } - ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, + ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, f->bucket.k.gen, data_opts); if (ret) goto err; + + *did_work = true; } err: darray_exit(&buckets); @@ -300,24 +302,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; - struct btree_trans *trans; struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight buckets; + struct buckets_in_flight *buckets; u64 last, wait; int ret = 0; - memset(&buckets, 0, sizeof(buckets)); - - ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); + buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); + if (!buckets) + return -ENOMEM; + ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); if (ret) { + kfree(buckets); bch_err_msg(c, ret, "allocating copygc buckets in flight"); return ret; } set_freezable(); - trans = bch2_trans_get(c); bch2_move_stats_init(&move_stats, "copygc"); bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, @@ -325,16 +327,18 @@ static int bch2_copygc_thread(void *arg) false); while (!ret && !kthread_should_stop()) { - bch2_trans_unlock(trans); + bool did_work = false; + + bch2_trans_unlock_long(ctxt.trans); cond_resched(); if (!c->copy_gc_enabled) { - move_buckets_wait(trans, &ctxt, &buckets, true); + move_buckets_wait(&ctxt, buckets, true); kthread_wait_freezable(c->copy_gc_enabled); } if (unlikely(freezing(current))) { - move_buckets_wait(trans, &ctxt, &buckets, true); + move_buckets_wait(&ctxt, buckets, true); __refrigerator(false); continue; } @@ -345,7 +349,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(trans, &ctxt, &buckets, true); + move_buckets_wait(&ctxt, buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -355,16 +359,29 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(trans, &ctxt, &buckets); + ret = bch2_copygc(&ctxt, buckets, &did_work); c->copygc_running = false; wake_up(&c->copygc_running_wq); + + if (!wait && !did_work) { + u64 min_member_capacity = bch2_min_rw_member_capacity(c); + + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; + + bch2_trans_unlock_long(ctxt.trans); + bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), + MAX_SCHEDULE_TIMEOUT); + } } - move_buckets_wait(trans, &ctxt, &buckets, true); - rhashtable_destroy(&buckets.table); - bch2_trans_put(trans); + move_buckets_wait(&ctxt, buckets, true); + + rhashtable_destroy(&buckets->table); + kfree(buckets); bch2_moving_ctxt_exit(&ctxt); + bch2_move_stats_exit(&move_stats, c); return 0; } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 232f50c73a94..8dd4046cca41 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -12,11 +12,6 @@ #define x(t, n, ...) [n] = #t, -const char * const bch2_iops_measurements[] = { - BCH_IOPS_MEASUREMENTS() - NULL -}; - const char * const bch2_error_actions[] = { BCH_ERROR_ACTIONS() NULL @@ -42,9 +37,8 @@ const char * const bch2_sb_compat[] = { NULL }; -const char * const bch2_btree_ids[] = { +const char * const __bch2_btree_ids[] = { BCH_BTREE_IDS() - "interior btree node", NULL }; @@ -271,14 +265,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) if (err) prt_printf(err, "%s: too small (min %llu)", opt->attr.name, opt->min); - return -ERANGE; + return -BCH_ERR_ERANGE_option_too_small; } if (opt->max && v >= opt->max) { if (err) prt_printf(err, "%s: too big (max %llu)", opt->attr.name, opt->max); - return -ERANGE; + return -BCH_ERR_ERANGE_option_too_big; } if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { @@ -295,6 +289,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) return -EINVAL; } + if (opt->fn.validate) + return opt->fn.validate(v, err); + return 0; } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 55014336c5f7..8526f177450a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -10,13 +10,12 @@ struct bch_fs; -extern const char * const bch2_iops_measurements[]; extern const char * const bch2_error_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; -extern const char * const bch2_btree_ids[]; +extern const char * const __bch2_btree_ids[]; extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_types[]; @@ -74,6 +73,7 @@ enum opt_type { struct bch_opt_fn { int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + int (*validate)(u64, struct printbuf *); }; /** diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index de41f9a14492..5e653eb81d54 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -415,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out, while (list[nr]) nr++; - while (flags && (bit = __ffs(flags)) < nr) { + while (flags && (bit = __ffs64(flags)) < nr) { if (!first) bch2_prt_printf(out, ","); first = false; bch2_prt_printf(out, "%s", list[bit]); - flags ^= 1 << bit; + flags ^= BIT_ULL(bit); } } diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index cb68ae44d597..a54647c36b85 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -59,17 +59,18 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (k.k->p.inode >= QTYP_NR) { - prt_printf(err, "invalid quota type (%llu >= %u)", - k.k->p.inode, QTYP_NR); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, + quota_type_invalid, + "invalid quota type (%llu >= %u)", + k.k->p.inode, QTYP_NR); +fsck_err: + return ret; } void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 2f463874a362..884f601f41c4 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -8,7 +8,7 @@ enum bkey_invalid_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 568f1e8e7507..3319190b8d9c 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -1,15 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" #include "compress.h" #include "disk_groups.h" #include "errcode.h" +#include "error.h" +#include "inode.h" #include "move.h" #include "rebalance.h" +#include "subvolume.h" #include "super-io.h" #include "trace.h" @@ -17,302 +23,396 @@ #include <linux/kthread.h> #include <linux/sched/cputime.h> -/* - * Check if an extent should be moved: - * returns -1 if it should not be moved, or - * device of pointer that should be moved, if known, or INT_MAX if unknown - */ -static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i; - - data_opts->rewrite_ptrs = 0; - data_opts->target = io_opts->background_target; - data_opts->extra_replicas = 0; - data_opts->btree_insert_flags = 0; - - if (io_opts->background_compression && - !bch2_bkey_is_incompressible(k)) { - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (!p.ptr.cached && - p.crc.compression_type != - bch2_compression_opt_to_type(io_opts->background_compression)) - data_opts->rewrite_ptrs |= 1U << i; - i++; - } - } +#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) - if (io_opts->background_target) { - const struct bch_extent_ptr *ptr; +static const char * const bch2_rebalance_state_strs[] = { +#define x(t) #t, + BCH_REBALANCE_STATES() + NULL +#undef x +}; - i = 0; - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && - bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) - data_opts->rewrite_ptrs |= 1U << i; - i++; - } - } +static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie *cookie; + u64 v; + int ret; - return data_opts->rewrite_ptrs != 0; + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + v = k.k->type == KEY_TYPE_cookie + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + + cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); + ret = PTR_ERR_OR_ZERO(cookie); + if (ret) + goto err; + + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter.pos; + cookie->v.cookie = cpu_to_le64(v + 1); + + ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } -void bch2_rebalance_add_key(struct bch_fs *c, - struct bkey_s_c k, - struct bch_io_opts *io_opts) +int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { - struct data_update_opts update_opts = { 0 }; - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; - unsigned i; - - if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) - return; - - i = 0; - ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) { - if ((1U << i) && update_opts.rewrite_ptrs) - if (atomic64_add_return(k.k->size, - &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == - k.k->size) - rebalance_wakeup(c); - i++; - } + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + __bch2_set_rebalance_needs_scan(trans, inum)); + rebalance_wakeup(c); + return ret; } -void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +int bch2_set_fs_needs_rebalance(struct bch_fs *c) { - if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == - sectors) - rebalance_wakeup(c); + return bch2_set_rebalance_needs_scan(c, 0); } -struct rebalance_work { - int dev_most_full_idx; - unsigned dev_most_full_percent; - u64 dev_most_full_work; - u64 dev_most_full_capacity; - u64 total_work; -}; +static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 v; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + v = k.k->type == KEY_TYPE_cookie + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + + if (v == cookie) + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} -static void rebalance_work_accumulate(struct rebalance_work *w, - u64 dev_work, u64 unknown_dev, u64 capacity, int idx) +static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, + struct btree_iter *work_iter) { - unsigned percent_full; - u64 work = dev_work + unknown_dev; + return !kthread_should_stop() + ? bch2_btree_iter_peek(work_iter) + : bkey_s_c_null; +} - /* avoid divide by 0 */ - if (!capacity) - return; +static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + extent_entry_drop(bkey_i_to_s(n), + (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); + return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + +static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + struct bpos work_pos, + struct btree_iter *extent_iter, + struct data_update_opts *data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, + work_pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek_slot(extent_iter); + if (bkey_err(k)) + return k; + + const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; + if (!r) { + /* raced due to btree write buffer, nothing to do */ + return bkey_s_c_null; + } - if (work < dev_work || work < unknown_dev) - work = U64_MAX; - work = min(work, capacity); + memset(data_opts, 0, sizeof(*data_opts)); - percent_full = div64_u64(work * 100, capacity); + data_opts->rewrite_ptrs = + bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); + data_opts->target = r->target; - if (percent_full >= w->dev_most_full_percent) { - w->dev_most_full_idx = idx; - w->dev_most_full_percent = percent_full; - w->dev_most_full_work = work; - w->dev_most_full_capacity = capacity; + if (!data_opts->rewrite_ptrs) { + /* + * device we would want to write to offline? devices in target + * changed? + * + * We'll now need a full scan before this extent is picked up + * again: + */ + int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); + return bkey_s_c_null; } - if (w->total_work + dev_work >= w->total_work && - w->total_work + dev_work >= dev_work) - w->total_work += dev_work; + return k; } -static struct rebalance_work rebalance_work(struct bch_fs *c) +noinline_for_stack +static int do_rebalance_extent(struct moving_context *ctxt, + struct bpos work_pos, + struct btree_iter *extent_iter) { - struct bch_dev *ca; - struct rebalance_work ret = { .dev_most_full_idx = -1 }; - u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); - unsigned i; - - for_each_online_member(ca, c, i) - rebalance_work_accumulate(&ret, - atomic64_read(&ca->rebalance_work), - unknown_dev, - bucket_to_sector(ca, ca->mi.nbuckets - - ca->mi.first_bucket), - i); - - rebalance_work_accumulate(&ret, - unknown_dev, 0, c->capacity, -1); + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &trans->c->rebalance; + struct data_update_opts data_opts; + struct bch_io_opts io_opts; + struct bkey_s_c k; + struct bkey_buf sk; + int ret; + + ctxt->stats = &r->work_stats; + r->state = BCH_REBALANCE_working; + + bch2_bkey_buf_init(&sk); + + ret = bkey_err(k = next_rebalance_extent(trans, work_pos, + extent_iter, &data_opts)); + if (ret || !k.k) + goto out; + ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + if (ret) + goto out; + + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); + if (ret) { + if (bch2_err_matches(ret, ENOMEM)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + ret = -BCH_ERR_transaction_restart_nested; + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto out; + + /* skip it and continue, XXX signal failure */ + ret = 0; + } +out: + bch2_bkey_buf_exit(&sk, c); return ret; } -static void rebalance_work_reset(struct bch_fs *c) +static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { - struct bch_dev *ca; - unsigned i; + unsigned target, compression; + + if (k.k->p.inode) { + target = io_opts->background_target; + compression = io_opts->background_compression ?: io_opts->compression; + } else { + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - for_each_online_member(ca, c, i) - atomic64_set(&ca->rebalance_work, 0); + target = r ? r->target : io_opts->background_target; + compression = r ? r->compression : + (io_opts->background_compression ?: io_opts->compression); + } - atomic64_set(&c->rebalance.work_unknown_dev, 0); + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); + data_opts->target = target; + return data_opts->rewrite_ptrs != 0; } -static unsigned long curr_cputime(void) +static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) { - u64 utime, stime; + struct btree_trans *trans = ctxt->trans; + struct bch_fs_rebalance *r = &trans->c->rebalance; + int ret; + + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); + ctxt->stats = &r->scan_stats; - task_cputime_adjusted(current, &utime, &stime); - return nsecs_to_jiffies(utime + stime); + if (!inum) { + r->scan_start = BBPOS_MIN; + r->scan_end = BBPOS_MAX; + } else { + r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); + r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); + } + + r->state = BCH_REBALANCE_scanning; + + ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: + commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + + bch2_move_stats_exit(&r->scan_stats, trans->c); + return ret; } -static int bch2_rebalance_thread(void *arg) +static void rebalance_wait(struct bch_fs *c) { - struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; struct io_clock *clock = &c->io_clock[WRITE]; - struct rebalance_work w, p; - struct bch_move_stats move_stats; - unsigned long start, prev_start; - unsigned long prev_run_time, prev_run_cputime; - unsigned long cputime, prev_cputime; - u64 io_start; - long throttle; + u64 now = atomic64_read(&clock->now); + u64 min_member_capacity = bch2_min_rw_member_capacity(c); - set_freezable(); + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; + + r->wait_iotime_end = now + (min_member_capacity >> 6); - io_start = atomic64_read(&clock->now); - p = rebalance_work(c); - prev_start = jiffies; - prev_cputime = curr_cputime(); + if (r->state != BCH_REBALANCE_waiting) { + r->wait_iotime_start = now; + r->wait_wallclock_start = ktime_get_real_ns(); + r->state = BCH_REBALANCE_waiting; + } + + bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); +} - bch2_move_stats_init(&move_stats, "rebalance"); - while (!kthread_wait_freezable(r->enabled)) { - cond_resched(); +static int do_rebalance(struct moving_context *ctxt) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &c->rebalance; + struct btree_iter rebalance_work_iter, extent_iter = { NULL }; + struct bkey_s_c k; + int ret = 0; - start = jiffies; - cputime = curr_cputime(); + bch2_move_stats_init(&r->work_stats, "rebalance_work"); + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); - prev_run_time = start - prev_start; - prev_run_cputime = cputime - prev_cputime; + bch2_trans_iter_init(trans, &rebalance_work_iter, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS); - w = rebalance_work(c); - BUG_ON(!w.dev_most_full_capacity); + while (!bch2_move_ratelimit(ctxt) && + !kthread_wait_freezable(r->enabled)) { + bch2_trans_begin(trans); - if (!w.total_work) { - r->state = REBALANCE_WAITING; - kthread_wait_freezable(rebalance_work(c).total_work); + ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; - } + if (ret || !k.k) + break; - /* - * If there isn't much work to do, throttle cpu usage: - */ - throttle = prev_run_cputime * 100 / - max(1U, w.dev_most_full_percent) - - prev_run_time; - - if (w.dev_most_full_percent < 20 && throttle > 0) { - r->throttled_until_iotime = io_start + - div_u64(w.dev_most_full_capacity * - (20 - w.dev_most_full_percent), - 50); - - if (atomic64_read(&clock->now) + clock->max_slop < - r->throttled_until_iotime) { - r->throttled_until_cputime = start + throttle; - r->state = REBALANCE_THROTTLED; - - bch2_kthread_io_clock_wait(clock, - r->throttled_until_iotime, - throttle); - continue; - } - } + ret = k.k->type == KEY_TYPE_cookie + ? do_rebalance_scan(ctxt, k.k->p.inode, + le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) + : do_rebalance_extent(ctxt, k.k->p, &extent_iter); - /* minimum 1 mb/sec: */ - r->pd.rate.rate = - max_t(u64, 1 << 11, - r->pd.rate.rate * - max(p.dev_most_full_percent, 1U) / - max(w.dev_most_full_percent, 1U)); - - io_start = atomic64_read(&clock->now); - p = w; - prev_start = start; - prev_cputime = cputime; - - r->state = REBALANCE_RUNNING; - memset(&move_stats, 0, sizeof(move_stats)); - rebalance_work_reset(c); - - bch2_move_data(c, - 0, POS_MIN, - BTREE_ID_NR, POS_MAX, - /* ratelimiting disabled for now */ - NULL, /* &r->pd.rate, */ - &move_stats, - writepoint_ptr(&c->rebalance_write_point), - true, - rebalance_pred, NULL); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_btree_iter_advance(&rebalance_work_iter); } - return 0; + bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(trans, &rebalance_work_iter); + bch2_move_stats_exit(&r->scan_stats, c); + + if (!ret && + !kthread_should_stop() && + !atomic64_read(&r->work_stats.sectors_seen) && + !atomic64_read(&r->scan_stats.sectors_seen)) { + bch2_trans_unlock_long(trans); + rebalance_wait(c); + } + + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); + return ret; } -void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) +static int bch2_rebalance_thread(void *arg) { + struct bch_fs *c = arg; struct bch_fs_rebalance *r = &c->rebalance; - struct rebalance_work w = rebalance_work(c); + struct moving_context ctxt; + int ret; - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 20); + set_freezable(); - prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); - prt_tab(out); + bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, + writepoint_ptr(&c->rebalance_write_point), + true); - prt_human_readable_u64(out, w.dev_most_full_work << 9); - prt_printf(out, "/"); - prt_human_readable_u64(out, w.dev_most_full_capacity << 9); - prt_newline(out); + while (!kthread_should_stop() && + !(ret = do_rebalance(&ctxt))) + ; - prt_printf(out, "total work:"); - prt_tab(out); + bch2_moving_ctxt_exit(&ctxt); - prt_human_readable_u64(out, w.total_work << 9); - prt_printf(out, "/"); - prt_human_readable_u64(out, c->capacity << 9); - prt_newline(out); + return 0; +} + +void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; - prt_printf(out, "rate:"); - prt_tab(out); - prt_printf(out, "%u", r->pd.rate.rate); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); + printbuf_indent_add(out, 2); switch (r->state) { - case REBALANCE_WAITING: - prt_printf(out, "waiting"); + case BCH_REBALANCE_waiting: { + u64 now = atomic64_read(&c->io_clock[WRITE].now); + + prt_str(out, "io wait duration: "); + bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + prt_newline(out); + + prt_str(out, "io wait remaining: "); + bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); + prt_newline(out); + + prt_str(out, "duration waited: "); + bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); + prt_newline(out); break; - case REBALANCE_THROTTLED: - prt_printf(out, "throttled for %lu sec or ", - (r->throttled_until_cputime - jiffies) / HZ); - prt_human_readable_u64(out, - (r->throttled_until_iotime - - atomic64_read(&c->io_clock[WRITE].now)) << 9); - prt_printf(out, " io"); + } + case BCH_REBALANCE_working: + bch2_move_stats_to_text(out, &r->work_stats); break; - case REBALANCE_RUNNING: - prt_printf(out, "running"); + case BCH_REBALANCE_scanning: + bch2_move_stats_to_text(out, &r->scan_stats); break; } prt_newline(out); + printbuf_indent_sub(out, 2); } void bch2_rebalance_stop(struct bch_fs *c) @@ -361,6 +461,4 @@ int bch2_rebalance_start(struct bch_fs *c) void bch2_fs_rebalance_init(struct bch_fs *c) { bch2_pd_controller_init(&c->rebalance.pd); - - atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 7ade0bb81cce..28a52638f16c 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -4,6 +4,9 @@ #include "rebalance_types.h" +int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); +int bch2_set_fs_needs_rebalance(struct bch_fs *); + static inline void rebalance_wakeup(struct bch_fs *c) { struct task_struct *p; @@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c) rcu_read_unlock(); } -void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, - struct bch_io_opts *); -void bch2_rebalance_add_work(struct bch_fs *, u64); - -void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); +void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 7462a92e9598..0fffb536c1d0 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -2,25 +2,36 @@ #ifndef _BCACHEFS_REBALANCE_TYPES_H #define _BCACHEFS_REBALANCE_TYPES_H +#include "bbpos_types.h" #include "move_types.h" -enum rebalance_state { - REBALANCE_WAITING, - REBALANCE_THROTTLED, - REBALANCE_RUNNING, +#define BCH_REBALANCE_STATES() \ + x(waiting) \ + x(working) \ + x(scanning) + +enum bch_rebalance_states { +#define x(t) BCH_REBALANCE_##t, + BCH_REBALANCE_STATES() +#undef x }; struct bch_fs_rebalance { - struct task_struct __rcu *thread; + struct task_struct __rcu *thread; struct bch_pd_controller pd; - atomic64_t work_unknown_dev; + enum bch_rebalance_states state; + u64 wait_iotime_start; + u64 wait_iotime_end; + u64 wait_wallclock_start; + + struct bch_move_stats work_stats; - enum rebalance_state state; - u64 throttled_until_iotime; - unsigned long throttled_until_cputime; + struct bbpos scan_start; + struct bbpos scan_end; + struct bch_move_stats scan_stats; - unsigned enabled:1; + unsigned enabled:1; }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 4cd660650e5b..9c30500ce920 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -23,6 +23,7 @@ #include "logged_ops.h" #include "move.h" #include "quota.h" +#include "rebalance.h" #include "recovery.h" #include "replicas.h" #include "sb-clean.h" @@ -182,7 +183,7 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_replay_key(trans, k)); if (ret) { bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", - bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); + bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); goto err; } } @@ -225,7 +226,7 @@ static int journal_replay_entry_early(struct bch_fs *c, if (entry->u64s) { r->level = entry->level; - bkey_copy(&r->key, &entry->start[0]); + bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { r->error = -EIO; @@ -364,10 +365,12 @@ static int read_btree_roots(struct bch_fs *c) } if (r->error) { - __fsck_err(c, btree_id_is_alloc(i) + __fsck_err(c, + btree_id_is_alloc(i) ? FSCK_CAN_IGNORE : 0, + btree_root_bkey_invalid, "invalid btree root %s", - bch2_btree_ids[i]); + bch2_btree_id_str(i)); if (i == BTREE_ID_alloc) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } @@ -375,8 +378,9 @@ static int read_btree_roots(struct bch_fs *c) ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { fsck_err(c, + btree_root_read_error, "error reading btree root %s", - bch2_btree_ids[i]); + bch2_btree_id_str(i)); if (btree_id_is_alloc(i)) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ret = 0; @@ -713,6 +717,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (mustfix_fsck_err_on(c->sb.clean && last_journal_entry && !journal_entry_empty(last_journal_entry), c, + clean_but_journal_not_empty, "filesystem marked clean but journal not empty")) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); @@ -720,7 +725,9 @@ int bch2_fs_recovery(struct bch_fs *c) } if (!last_journal_entry) { - fsck_err_on(!c->sb.clean, c, "no journal entries found"); + fsck_err_on(!c->sb.clean, c, + dirty_but_no_journal_entries, + "no journal entries found"); if (clean) goto use_clean; @@ -728,6 +735,13 @@ int bch2_fs_recovery(struct bch_fs *c) if (*i) { last_journal_entry = &(*i)->j; (*i)->ignore = false; + /* + * This was probably a NO_FLUSH entry, + * so last_seq was garbage - but we know + * we're only using a single journal + * entry, set it here: + */ + (*i)->j.last_seq = (*i)->j.seq; break; } } @@ -901,7 +915,7 @@ out: } kfree(clean); - if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { + if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { bch2_fs_read_write_early(c); bch2_delete_dead_snapshots_async(c); } @@ -946,16 +960,12 @@ int bch2_fs_initialize(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - for_each_online_member(ca, c, i) + for_each_member_device(ca, c, i) bch2_dev_usage_init(ca); - for_each_online_member(ca, c, i) { - ret = bch2_dev_journal_alloc(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - goto err; - } - } + ret = bch2_fs_journal_alloc(c); + if (ret) + goto err; /* * journal_res_get() will crash if called before this has @@ -973,15 +983,13 @@ int bch2_fs_initialize(struct bch_fs *c) * btree updates */ bch_verbose(c, "marking superblocks"); - for_each_member_device(ca, c, i) { - ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) { - percpu_ref_put(&ca->ref); - goto err; - } + ret = bch2_trans_mark_dev_sbs(c); + bch_err_msg(c, ret, "marking superblocks"); + if (ret) + goto err; + for_each_online_member(ca, c, i) ca->new_fs_bucket_idx = 0; - } ret = bch2_fs_freespace_init(c); if (ret) diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index fbfa9d831d6f..515e3d62c2ac 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -14,6 +14,8 @@ x(snapshots_read, PASS_ALWAYS) \ x(check_topology, 0) \ x(check_allocations, PASS_FSCK) \ + x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ x(journal_replay, PASS_ALWAYS) \ x(check_alloc_info, PASS_FSCK) \ @@ -27,11 +29,12 @@ x(check_snapshot_trees, PASS_FSCK) \ x(check_snapshots, PASS_FSCK) \ x(check_subvols, PASS_FSCK) \ - x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ + x(delete_dead_snapshots, PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 0) \ x(resume_logged_ops, PASS_ALWAYS) \ x(check_inodes, PASS_FSCK) \ x(check_extents, PASS_FSCK) \ + x(check_indirect_extents, PASS_FSCK) \ x(check_dirents, PASS_FSCK) \ x(check_xattrs, PASS_FSCK) \ x(check_root, PASS_FSCK) \ @@ -39,6 +42,7 @@ x(check_nlinks, PASS_FSCK) \ x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 0) \ + x(set_fs_needs_rebalance, 0) \ enum bch_recovery_pass { #define x(n, when) BCH_RECOVERY_PASS_##n, diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index d77d0ea9afff..6e1bfe9feb59 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -7,6 +7,7 @@ #include "inode.h" #include "io_misc.h" #include "io_write.h" +#include "rebalance.h" #include "reflink.h" #include "subvolume.h" #include "super-io.h" @@ -27,7 +28,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { @@ -74,7 +75,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ -int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { @@ -103,28 +104,29 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } #endif +static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags) +{ + if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { + new->k.type = KEY_TYPE_deleted; + new->k.size = 0; + set_bkey_val_u64s(&new->k, 0);; + *flags &= ~BTREE_TRIGGER_INSERT; + } +} + int bch2_trans_mark_reflink_v(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { - if (!(flags & BTREE_TRIGGER_OVERWRITE)) { - struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); - - if (!r->v.refcount) { - r->k.type = KEY_TYPE_deleted; - r->k.size = 0; - set_bkey_val_u64s(&r->k, 0); - return 0; - } - } + check_indirect_extent_deleting(new, &flags); return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); } /* indirect inline data */ -int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { @@ -132,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, } void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) + struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); unsigned datalen = bkey_inline_data_bytes(k.k); @@ -147,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { - if (!(flags & BTREE_TRIGGER_OVERWRITE)) { - struct bkey_i_indirect_inline_data *r = - bkey_i_to_indirect_inline_data(new); - - if (!r->v.refcount) { - r->k.type = KEY_TYPE_deleted; - r->k.size = 0; - set_bkey_val_u64s(&r->k, 0); - } - } + check_indirect_extent_deleting(new, &flags); return 0; } @@ -260,8 +253,9 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos dst_start = POS(dst_inum.inum, dst_offset); struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; + struct bch_io_opts opts; struct bpos src_want; - u64 dst_done; + u64 dst_done = 0; u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; @@ -277,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_src); trans = bch2_trans_get(c); + ret = bch2_inum_opts_get(trans, src_inum, &opts); + if (ret) + goto err; + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, BTREE_ITER_INTENT); bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, @@ -360,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_extent_update(trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, - new_i_size, i_sectors_delta, - true); + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, + opts.background_target, + opts.background_compression) ?: + bch2_extent_update(trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, + new_i_size, i_sectors_delta, + true); bch2_disk_reservation_put(c, &disk_res); } bch2_trans_iter_exit(trans, &dst_iter); @@ -394,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_exit(trans, &inode_iter); } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); - +err: bch2_trans_put(trans); bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index fe52538efb52..8ccf3f9c4939 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -4,7 +4,7 @@ enum bkey_invalid_flags; -int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -19,7 +19,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .min_val_size = 16, \ }) -int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -35,7 +35,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 8, \ }) -int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cef2a0447b86..1c3ae13bfced 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -462,18 +462,13 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) { lockdep_assert_held(&c->replicas_gc_lock); - if (ret) - goto err; - mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); - ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); - if (ret) - goto err; + ret = ret ?: + bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: + replicas_table_update(c, &c->replicas_gc); - ret = replicas_table_update(c, &c->replicas_gc); -err: kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; @@ -579,12 +574,9 @@ retry: bch2_cpu_replicas_sort(&new); - ret = bch2_cpu_replicas_to_sb_replicas(c, &new); - if (ret) - goto err; + ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: + replicas_table_update(c, &new); - ret = replicas_table_update(c, &new); -err: kfree(new.entries); percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 61203d7c8d36..e151ada1c8bd 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -82,6 +82,7 @@ int bch2_verify_superblock_clean(struct bch_fs *c, int ret = 0; if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + sb_clean_journal_seq_mismatch, "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), le64_to_cpu(j->seq))) { @@ -119,6 +120,7 @@ int bch2_verify_superblock_clean(struct bch_fs *c, k1->k.u64s != k2->k.u64s || memcmp(k1, k2, bkey_bytes(&k1->k)) || l1 != l2, c, + sb_clean_btree_root_mismatch, "superblock btree root %u doesn't match journal after clean shutdown\n" "sb: l=%u %s\n" "journal: l=%u %s\n", i, @@ -140,6 +142,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); if (fsck_err_on(!sb_clean, c, + sb_clean_missing, "superblock marked clean but clean section not present")) { SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; @@ -373,7 +376,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) entry = sb_clean->start; bch2_journal_super_entries_add_common(c, &entry, 0); - entry = bch2_btree_roots_to_journal_entries(c, entry, entry); + entry = bch2_btree_roots_to_journal_entries(c, entry, 0); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); memset(entry, 0, diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c new file mode 100644 index 000000000000..f0930ab7f036 --- /dev/null +++ b/fs/bcachefs/sb-errors.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "sb-errors.h" +#include "super-io.h" + +static const char * const bch2_sb_error_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_SB_ERRS() + NULL +}; + +static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) +{ + if (id < BCH_SB_ERR_MAX) + prt_str(out, bch2_sb_error_strs[id]); + else + prt_printf(out, "(unknown error %u)", id); +} + +static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) +{ + return e + ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]) + : 0; +} + +static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) +{ + return (sizeof(struct bch_sb_field_errors) + + sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); +} + +static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_errors *e = field_to_type(f, errors); + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); + + for (i = 0; i < nr; i++) { + if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { + prt_printf(err, "entry with count 0 (id "); + bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); + prt_printf(err, ")"); + return -BCH_ERR_invalid_sb_errors; + } + + if (i + 1 < nr && + BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= + BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { + prt_printf(err, "entries out of order"); + return -BCH_ERR_invalid_sb_errors; + } + } + + return 0; +} + +static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_errors *e = field_to_type(f, errors); + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); + + if (out->nr_tabstops <= 1) + printbuf_tabstop_push(out, 16); + + for (i = 0; i < nr; i++) { + bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); + prt_tab(out); + prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); + prt_tab(out); + bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_errors = { + .validate = bch2_sb_errors_validate, + .to_text = bch2_sb_errors_to_text, +}; + +void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) +{ + bch_sb_errors_cpu *e = &c->fsck_error_counts; + struct bch_sb_error_entry_cpu n = { + .id = err, + .nr = 1, + .last_error_time = ktime_get_real_seconds() + }; + unsigned i; + + mutex_lock(&c->fsck_error_counts_lock); + for (i = 0; i < e->nr; i++) { + if (err == e->data[i].id) { + e->data[i].nr++; + e->data[i].last_error_time = n.last_error_time; + goto out; + } + if (err < e->data[i].id) + break; + } + + if (darray_make_room(e, 1)) + goto out; + + darray_insert_item(e, i, n); +out: + mutex_unlock(&c->fsck_error_counts_lock); +} + +void bch2_sb_errors_from_cpu(struct bch_fs *c) +{ + bch_sb_errors_cpu *src = &c->fsck_error_counts; + struct bch_sb_field_errors *dst = + bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + unsigned i; + + if (!dst) + return; + + for (i = 0; i < src->nr; i++) { + SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); + SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); + dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); + } +} + +static int bch2_sb_errors_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); + bch_sb_errors_cpu *dst = &c->fsck_error_counts; + unsigned i, nr = bch2_sb_field_errors_nr_entries(src); + int ret; + + if (!nr) + return 0; + + mutex_lock(&c->fsck_error_counts_lock); + ret = darray_make_room(dst, nr); + if (ret) + goto err; + + dst->nr = nr; + + for (i = 0; i < nr; i++) { + dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); + dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); + dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); + } +err: + mutex_unlock(&c->fsck_error_counts_lock); + + return ret; +} + +void bch2_fs_sb_errors_exit(struct bch_fs *c) +{ + darray_exit(&c->fsck_error_counts); +} + +void bch2_fs_sb_errors_init_early(struct bch_fs *c) +{ + mutex_init(&c->fsck_error_counts_lock); + darray_init(&c->fsck_error_counts); +} + +int bch2_fs_sb_errors_init(struct bch_fs *c) +{ + return bch2_sb_errors_to_cpu(c); +} diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h new file mode 100644 index 000000000000..5a09a53966be --- /dev/null +++ b/fs/bcachefs/sb-errors.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_H +#define _BCACHEFS_SB_ERRORS_H + +#include "sb-errors_types.h" + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0) \ + x(dirty_but_no_journal_entries, 1) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ + x(sb_clean_journal_seq_mismatch, 3) \ + x(sb_clean_btree_root_mismatch, 4) \ + x(sb_clean_missing, 5) \ + x(jset_unsupported_version, 6) \ + x(jset_unknown_csum, 7) \ + x(jset_last_seq_newer_than_seq, 8) \ + x(jset_past_bucket_end, 9) \ + x(jset_seq_blacklisted, 10) \ + x(journal_entries_missing, 11) \ + x(journal_entry_replicas_not_marked, 12) \ + x(journal_entry_past_jset_end, 13) \ + x(journal_entry_replicas_data_mismatch, 14) \ + x(journal_entry_bkey_u64s_0, 15) \ + x(journal_entry_bkey_past_end, 16) \ + x(journal_entry_bkey_bad_format, 17) \ + x(journal_entry_bkey_invalid, 18) \ + x(journal_entry_btree_root_bad_size, 19) \ + x(journal_entry_blacklist_bad_size, 20) \ + x(journal_entry_blacklist_v2_bad_size, 21) \ + x(journal_entry_blacklist_v2_start_past_end, 22) \ + x(journal_entry_usage_bad_size, 23) \ + x(journal_entry_data_usage_bad_size, 24) \ + x(journal_entry_clock_bad_size, 25) \ + x(journal_entry_clock_bad_rw, 26) \ + x(journal_entry_dev_usage_bad_size, 27) \ + x(journal_entry_dev_usage_bad_dev, 28) \ + x(journal_entry_dev_usage_bad_pad, 29) \ + x(btree_node_unreadable, 30) \ + x(btree_node_fault_injected, 31) \ + x(btree_node_bad_magic, 32) \ + x(btree_node_bad_seq, 33) \ + x(btree_node_unsupported_version, 34) \ + x(btree_node_bset_older_than_sb_min, 35) \ + x(btree_node_bset_newer_than_sb, 36) \ + x(btree_node_data_missing, 37) \ + x(btree_node_bset_after_end, 38) \ + x(btree_node_replicas_sectors_written_mismatch, 39) \ + x(btree_node_replicas_data_mismatch, 40) \ + x(bset_unknown_csum, 41) \ + x(bset_bad_csum, 42) \ + x(bset_past_end_of_btree_node, 43) \ + x(bset_wrong_sector_offset, 44) \ + x(bset_empty, 45) \ + x(bset_bad_seq, 46) \ + x(bset_blacklisted_journal_seq, 47) \ + x(first_bset_blacklisted_journal_seq, 48) \ + x(btree_node_bad_btree, 49) \ + x(btree_node_bad_level, 50) \ + x(btree_node_bad_min_key, 51) \ + x(btree_node_bad_max_key, 52) \ + x(btree_node_bad_format, 53) \ + x(btree_node_bkey_past_bset_end, 54) \ + x(btree_node_bkey_bad_format, 55) \ + x(btree_node_bad_bkey, 56) \ + x(btree_node_bkey_out_of_order, 57) \ + x(btree_root_bkey_invalid, 58) \ + x(btree_root_read_error, 59) \ + x(btree_root_bad_min_key, 50) \ + x(btree_root_bad_max_key, 61) \ + x(btree_node_read_error, 62) \ + x(btree_node_topology_bad_min_key, 63) \ + x(btree_node_topology_bad_max_key, 64) \ + x(btree_node_topology_overwritten_by_prev_node, 65) \ + x(btree_node_topology_overwritten_by_next_node, 66) \ + x(btree_node_topology_interior_node_empty, 67) \ + x(fs_usage_hidden_wrong, 68) \ + x(fs_usage_btree_wrong, 69) \ + x(fs_usage_data_wrong, 70) \ + x(fs_usage_cached_wrong, 71) \ + x(fs_usage_reserved_wrong, 72) \ + x(fs_usage_persistent_reserved_wrong, 73) \ + x(fs_usage_nr_inodes_wrong, 74) \ + x(fs_usage_replicas_wrong, 75) \ + x(dev_usage_buckets_wrong, 76) \ + x(dev_usage_sectors_wrong, 77) \ + x(dev_usage_fragmented_wrong, 78) \ + x(dev_usage_buckets_ec_wrong, 79) \ + x(bkey_version_in_future, 80) \ + x(bkey_u64s_too_small, 81) \ + x(bkey_invalid_type_for_btree, 82) \ + x(bkey_extent_size_zero, 83) \ + x(bkey_extent_size_greater_than_offset, 84) \ + x(bkey_size_nonzero, 85) \ + x(bkey_snapshot_nonzero, 86) \ + x(bkey_snapshot_zero, 87) \ + x(bkey_at_pos_max, 88) \ + x(bkey_before_start_of_btree_node, 89) \ + x(bkey_after_end_of_btree_node, 90) \ + x(bkey_val_size_nonzero, 91) \ + x(bkey_val_size_too_small, 92) \ + x(alloc_v1_val_size_bad, 93) \ + x(alloc_v2_unpack_error, 94) \ + x(alloc_v3_unpack_error, 95) \ + x(alloc_v4_val_size_bad, 96) \ + x(alloc_v4_backpointers_start_bad, 97) \ + x(alloc_key_data_type_bad, 98) \ + x(alloc_key_empty_but_have_data, 99) \ + x(alloc_key_dirty_sectors_0, 100) \ + x(alloc_key_data_type_inconsistency, 101) \ + x(alloc_key_to_missing_dev_bucket, 102) \ + x(alloc_key_cached_inconsistency, 103) \ + x(alloc_key_cached_but_read_time_zero, 104) \ + x(alloc_key_to_missing_lru_entry, 105) \ + x(alloc_key_data_type_wrong, 106) \ + x(alloc_key_gen_wrong, 107) \ + x(alloc_key_dirty_sectors_wrong, 108) \ + x(alloc_key_cached_sectors_wrong, 109) \ + x(alloc_key_stripe_wrong, 110) \ + x(alloc_key_stripe_redundancy_wrong, 111) \ + x(bucket_sector_count_overflow, 112) \ + x(bucket_metadata_type_mismatch, 113) \ + x(need_discard_key_wrong, 114) \ + x(freespace_key_wrong, 115) \ + x(freespace_hole_missing, 116) \ + x(bucket_gens_val_size_bad, 117) \ + x(bucket_gens_key_wrong, 118) \ + x(bucket_gens_hole_wrong, 119) \ + x(bucket_gens_to_invalid_dev, 120) \ + x(bucket_gens_to_invalid_buckets, 121) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ + x(need_discard_freespace_key_bad, 124) \ + x(backpointer_pos_wrong, 125) \ + x(backpointer_to_missing_device, 126) \ + x(backpointer_to_missing_alloc, 127) \ + x(backpointer_to_missing_ptr, 128) \ + x(lru_entry_at_time_0, 129) \ + x(lru_entry_to_invalid_bucket, 130) \ + x(lru_entry_bad, 131) \ + x(btree_ptr_val_too_big, 132) \ + x(btree_ptr_v2_val_too_big, 133) \ + x(btree_ptr_has_non_ptr, 134) \ + x(extent_ptrs_invalid_entry, 135) \ + x(extent_ptrs_no_ptrs, 136) \ + x(extent_ptrs_too_many_ptrs, 137) \ + x(extent_ptrs_redundant_crc, 138) \ + x(extent_ptrs_redundant_stripe, 139) \ + x(extent_ptrs_unwritten, 140) \ + x(extent_ptrs_written_and_unwritten, 141) \ + x(ptr_to_invalid_device, 142) \ + x(ptr_to_duplicate_device, 143) \ + x(ptr_after_last_bucket, 144) \ + x(ptr_before_first_bucket, 145) \ + x(ptr_spans_multiple_buckets, 146) \ + x(ptr_to_missing_backpointer, 147) \ + x(ptr_to_missing_alloc_key, 148) \ + x(ptr_to_missing_replicas_entry, 149) \ + x(ptr_to_missing_stripe, 150) \ + x(ptr_to_incorrect_stripe, 151) \ + x(ptr_gen_newer_than_bucket_gen, 152) \ + x(ptr_too_stale, 153) \ + x(stale_dirty_ptr, 154) \ + x(ptr_bucket_data_type_mismatch, 155) \ + x(ptr_cached_and_erasure_coded, 156) \ + x(ptr_crc_uncompressed_size_too_small, 157) \ + x(ptr_crc_csum_type_unknown, 158) \ + x(ptr_crc_compression_type_unknown, 159) \ + x(ptr_crc_redundant, 160) \ + x(ptr_crc_uncompressed_size_too_big, 161) \ + x(ptr_crc_nonce_mismatch, 162) \ + x(ptr_stripe_redundant, 163) \ + x(reservation_key_nr_replicas_invalid, 164) \ + x(reflink_v_refcount_wrong, 165) \ + x(reflink_p_to_missing_reflink_v, 166) \ + x(stripe_pos_bad, 167) \ + x(stripe_val_size_bad, 168) \ + x(stripe_sector_count_wrong, 169) \ + x(snapshot_tree_pos_bad, 170) \ + x(snapshot_tree_to_missing_snapshot, 171) \ + x(snapshot_tree_to_missing_subvol, 172) \ + x(snapshot_tree_to_wrong_subvol, 173) \ + x(snapshot_tree_to_snapshot_subvol, 174) \ + x(snapshot_pos_bad, 175) \ + x(snapshot_parent_bad, 176) \ + x(snapshot_children_not_normalized, 177) \ + x(snapshot_child_duplicate, 178) \ + x(snapshot_child_bad, 179) \ + x(snapshot_skiplist_not_normalized, 180) \ + x(snapshot_skiplist_bad, 181) \ + x(snapshot_should_not_have_subvol, 182) \ + x(snapshot_to_bad_snapshot_tree, 183) \ + x(snapshot_bad_depth, 184) \ + x(snapshot_bad_skiplist, 185) \ + x(subvol_pos_bad, 186) \ + x(subvol_not_master_and_not_snapshot, 187) \ + x(subvol_to_missing_root, 188) \ + x(subvol_root_wrong_bi_subvol, 189) \ + x(bkey_in_missing_snapshot, 190) \ + x(inode_pos_inode_nonzero, 191) \ + x(inode_pos_blockdev_range, 192) \ + x(inode_unpack_error, 193) \ + x(inode_str_hash_invalid, 194) \ + x(inode_v3_fields_start_bad, 195) \ + x(inode_snapshot_mismatch, 196) \ + x(inode_unlinked_but_clean, 197) \ + x(inode_unlinked_but_nlink_nonzero, 198) \ + x(inode_checksum_type_invalid, 199) \ + x(inode_compression_type_invalid, 200) \ + x(inode_subvol_root_but_not_dir, 201) \ + x(inode_i_size_dirty_but_clean, 202) \ + x(inode_i_sectors_dirty_but_clean, 203) \ + x(inode_i_sectors_wrong, 204) \ + x(inode_dir_wrong_nlink, 205) \ + x(inode_dir_multiple_links, 206) \ + x(inode_multiple_links_but_nlink_0, 207) \ + x(inode_wrong_backpointer, 208) \ + x(inode_wrong_nlink, 209) \ + x(inode_unreachable, 210) \ + x(deleted_inode_but_clean, 211) \ + x(deleted_inode_missing, 212) \ + x(deleted_inode_is_dir, 213) \ + x(deleted_inode_not_unlinked, 214) \ + x(extent_overlapping, 215) \ + x(extent_in_missing_inode, 216) \ + x(extent_in_non_reg_inode, 217) \ + x(extent_past_end_of_inode, 218) \ + x(dirent_empty_name, 219) \ + x(dirent_val_too_big, 220) \ + x(dirent_name_too_long, 221) \ + x(dirent_name_embedded_nul, 222) \ + x(dirent_name_dot_or_dotdot, 223) \ + x(dirent_name_has_slash, 224) \ + x(dirent_d_type_wrong, 225) \ + x(dirent_d_parent_subvol_wrong, 226) \ + x(dirent_in_missing_dir_inode, 227) \ + x(dirent_in_non_dir_inode, 228) \ + x(dirent_to_missing_inode, 229) \ + x(dirent_to_missing_subvol, 230) \ + x(dirent_to_itself, 231) \ + x(quota_type_invalid, 232) \ + x(xattr_val_size_too_small, 233) \ + x(xattr_val_size_too_big, 234) \ + x(xattr_invalid_type, 235) \ + x(xattr_name_invalid_chars, 236) \ + x(xattr_in_missing_inode, 237) \ + x(root_subvol_missing, 238) \ + x(root_dir_missing, 239) \ + x(root_inode_not_dir, 240) \ + x(dir_loop, 241) \ + x(hash_table_key_duplicate, 242) \ + x(hash_table_key_wrong_offset, 243) + +enum bch_sb_error_id { +#define x(t, n) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + +extern const struct bch_sb_field_ops bch_sb_field_ops_errors; + +void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); + +void bch2_sb_errors_from_cpu(struct bch_fs *); + +void bch2_fs_sb_errors_exit(struct bch_fs *); +void bch2_fs_sb_errors_init_early(struct bch_fs *); +int bch2_fs_sb_errors_init(struct bch_fs *); + +#endif /* _BCACHEFS_SB_ERRORS_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h new file mode 100644 index 000000000000..b1c099843a39 --- /dev/null +++ b/fs/bcachefs/sb-errors_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_TYPES_H +#define _BCACHEFS_SB_ERRORS_TYPES_H + +#include "darray.h" + +struct bch_sb_error_entry_cpu { + u64 id:16, + nr:48; + u64 last_error_time; +}; + +typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; + +#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ + diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 6dd85bb996fe..bed0f857fe5b 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -7,21 +7,28 @@ #include "sb-members.h" #include "super-io.h" -/* Code for bch_sb_field_members_v1: */ +#define x(t, n, ...) [n] = #t, +static const char * const bch2_iops_measurements[] = { + BCH_IOPS_MEASUREMENTS() + NULL +}; -static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i) -{ - return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); -} +char * const bch2_member_error_strs[] = { + BCH_MEMBER_ERROR_TYPES() + NULL +}; +#undef x + +/* Code for bch_sb_field_members_v1: */ struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) { - return members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); + return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); } static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) { - struct bch_member ret, *p = members_v2_get_mut(mi, i); + struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); memset(&ret, 0, sizeof(ret)); memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); return ret; @@ -36,7 +43,8 @@ static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int { struct bch_member ret, *p = members_v1_get_mut(mi, i); memset(&ret, 0, sizeof(ret)); - memcpy(&ret, p, min_t(size_t, sizeof(struct bch_member), sizeof(ret))); return ret; + memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); + return ret; } struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) @@ -62,7 +70,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); - memmove(dst, members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); + memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); memset(dst + le16_to_cpu(mi->member_bytes), 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); } @@ -71,7 +79,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) return 0; } -int bch2_members_v2_init(struct bch_fs *c) +int bch2_sb_members_v2_init(struct bch_fs *c) { struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; @@ -91,7 +99,7 @@ int bch2_members_v2_init(struct bch_fs *c) return sb_members_v2_resize_entries(c); } -int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) +int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) { struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; @@ -105,7 +113,7 @@ int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) - memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); + memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); return 0; } @@ -155,6 +163,8 @@ static void member_to_text(struct printbuf *out, u64 bucket_size = le16_to_cpu(m.bucket_size); u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; + if (!bch2_member_exists(&m)) + return; prt_printf(out, "Device:"); prt_tab(out); @@ -163,6 +173,21 @@ static void member_to_text(struct printbuf *out, printbuf_indent_add(out, 2); + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(&m)) { + unsigned idx = BCH_MEMBER_GROUP(&m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + prt_printf(out, "UUID:"); prt_tab(out); pr_uuid(out, m.uuid.b); @@ -173,6 +198,13 @@ static void member_to_text(struct printbuf *out, prt_units_u64(out, device_size << 9); prt_newline(out); + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s errors:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, le64_to_cpu(m.errors[i])); + prt_newline(out); + } + for (unsigned i = 0; i < BCH_IOPS_NR; i++) { prt_printf(out, "%s iops:", bch2_iops_measurements[i]); prt_tab(out); @@ -198,7 +230,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Last mount:"); prt_tab(out); if (m.last_mount) - pr_time(out, le64_to_cpu(m.last_mount)); + bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); else prt_printf(out, "(never)"); prt_newline(out); @@ -211,21 +243,6 @@ static void member_to_text(struct printbuf *out, : "unknown"); prt_newline(out); - prt_printf(out, "Label:"); - prt_tab(out); - if (BCH_MEMBER_GROUP(&m)) { - unsigned idx = BCH_MEMBER_GROUP(&m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { - prt_printf(out, "(none)"); - } - prt_newline(out); - prt_printf(out, "Data allowed:"); prt_tab(out); if (BCH_MEMBER_DATA_ALLOWED(&m)) @@ -262,8 +279,7 @@ static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); unsigned i; - if ((void *) members_v1_get_mut(mi, sb->nr_devices) > - vstruct_end(&mi->field)) { + if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { prt_printf(err, "too many devices for section size"); return -BCH_ERR_invalid_sb_members; } @@ -286,10 +302,8 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); unsigned i; - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = members_v1_get(mi, i); - member_to_text(out, m, gi, sb, i); - } + for (i = 0; i < sb->nr_devices; i++) + member_to_text(out, members_v1_get(mi, i), gi, sb, i); } const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { @@ -304,10 +318,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); unsigned i; - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member m = members_v2_get(mi, i); - member_to_text(out, m, gi, sb, i); - } + for (i = 0; i < sb->nr_devices; i++) + member_to_text(out, members_v2_get(mi, i), gi, sb, i); } static int bch2_sb_members_v2_validate(struct bch_sb *sb, @@ -315,7 +327,7 @@ static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct printbuf *err) { struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); - size_t mi_bytes = (void *) members_v2_get_mut(mi, sb->nr_devices) - + size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - (void *) mi; if (mi_bytes > vstruct_bytes(&mi->field)) { @@ -337,3 +349,72 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { .validate = bch2_sb_members_v2_validate, .to_text = bch2_sb_members_v2_to_text, }; + +void bch2_sb_members_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + struct bch_dev *ca; + unsigned i, e; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + struct bch_member *m = __bch2_members_v2_get_mut(mi, i); + + for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) + m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); + } + rcu_read_unlock(); +} + +void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_member m; + + mutex_lock(&ca->fs->sb_lock); + m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&ca->fs->sb_lock); + + printbuf_tabstop_push(out, 12); + + prt_str(out, "IO errors since filesystem creation"); + prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, atomic64_read(&ca->errors[i])); + prt_newline(out); + } + printbuf_indent_sub(out, 2); + + prt_str(out, "IO errors since "); + bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); + prt_str(out, " ago"); + prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +void bch2_dev_errors_reset(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_member *m; + + mutex_lock(&c->sb_lock); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) + m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); + m->errors_reset_time = ktime_get_real_seconds(); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 430f3457bfd4..03613e3eb8e3 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -2,8 +2,16 @@ #ifndef _BCACHEFS_SB_MEMBERS_H #define _BCACHEFS_SB_MEMBERS_H -int bch2_members_v2_init(struct bch_fs *c); -int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); +extern char * const bch2_member_error_strs[]; + +static inline struct bch_member * +__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) +{ + return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); +} + +int bch2_sb_members_v2_init(struct bch_fs *c); +int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); @@ -179,4 +187,41 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; +static inline bool bch2_member_exists(struct bch_member *m) +{ + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); +} + +static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) +{ + if (dev < sb->nr_devices) { + struct bch_member m = bch2_sb_member_get(sb, dev); + return bch2_member_exists(&m); + } + return false; +} + +static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) +{ + return (struct bch_member_cpu) { + .nbuckets = le64_to_cpu(mi->nbuckets), + .first_bucket = le16_to_cpu(mi->first_bucket), + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .valid = bch2_member_exists(mi), + }; +} + +void bch2_sb_members_from_cpu(struct bch_fs *); + +void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); +void bch2_dev_errors_reset(struct bch_dev *); + #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index b684b9f00c1b..b775cf0fb7cb 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -11,6 +11,8 @@ #include <linux/sched/task.h> #include <linux/slab.h> +#include <trace/events/lock.h> + #include "six.h" #ifdef DEBUG @@ -462,11 +464,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, smp_mb__after_atomic(); } + trace_contention_begin(lock, 0); + lock_contended(&lock->dep_map, ip); + if (six_optimistic_spin(lock, type)) goto out; - lock_contended(&lock->dep_map, ip); - wait->task = current; wait->lock_want = type; wait->lock_acquired = false; @@ -546,6 +549,7 @@ out: six_clear_bitmask(lock, SIX_LOCK_HELD_write); six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); } + trace_contention_end(lock, 0); return ret; } diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4982468bfe11..e9af77b384c7 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -30,17 +30,18 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(t.v->root_snapshot)); } -int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1))) { - prt_printf(err, "bad pos"); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1)), c, err, + snapshot_tree_pos_bad, + "bad pos"); +fsck_err: + return ret; } int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, @@ -202,68 +203,60 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->skip[2])); } -int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; + int ret = 0; - if (bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1))) { - prt_printf(err, "bad pos"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1)), c, err, + snapshot_pos_bad, + "bad pos"); s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - if (id && id <= k.k->p.offset) { - prt_printf(err, "bad parent node (%u <= %llu)", - id, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, + snapshot_parent_bad, + "bad parent node (%u <= %llu)", + id, k.k->p.offset); - if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { - prt_printf(err, "children not normalized"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, + snapshot_children_not_normalized, + "children not normalized"); - if (s.v->children[0] && - s.v->children[0] == s.v->children[1]) { - prt_printf(err, "duplicate child nodes"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, + snapshot_child_duplicate, + "duplicate child nodes"); for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - if (id >= k.k->p.offset) { - prt_printf(err, "bad child node (%u >= %llu)", - id, k.k->p.offset); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(id >= k.k->p.offset, c, err, + snapshot_child_bad, + "bad child node (%u >= %llu)", + id, k.k->p.offset); } if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { - if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { - prt_printf(err, "skiplist not normalized"); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, + snapshot_skiplist_not_normalized, + "skiplist not normalized"); for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = le32_to_cpu(s.v->skip[i]); - if ((id && !s.v->parent) || - (id && id <= k.k->p.offset)) { - prt_printf(err, "bad skiplist node %u", id); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, + snapshot_skiplist_bad, + "bad skiplist node %u", id); } } - - return 0; +fsck_err: + return ret; } static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) @@ -325,8 +318,9 @@ int bch2_mark_snapshot(struct btree_trans *trans, __set_is_ancestor_bitmap(c, id); if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + bch2_delete_dead_snapshots_async(c); } } else { memset(t, 0, sizeof(*t)); @@ -529,7 +523,7 @@ static int check_snapshot_tree(struct btree_trans *trans, if (fsck_err_on(ret || root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), - c, + c, snapshot_tree_to_missing_snapshot, "snapshot tree points to missing/incorrect snapshot:\n %s", (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); @@ -541,17 +535,20 @@ static int check_snapshot_tree(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; - if (fsck_err_on(ret, c, + if (fsck_err_on(ret, + c, snapshot_tree_to_missing_subvol, "snapshot tree points to missing subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(!bch2_snapshot_is_ancestor_early(c, le32_to_cpu(subvol.snapshot), - root_id), c, + root_id), + c, snapshot_tree_to_wrong_subvol, "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || - fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, + fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), + c, snapshot_tree_to_snapshot_subvol, "snapshot tree points to snapshot subvolume:\n %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { @@ -787,7 +784,9 @@ static int check_snapshot(struct btree_trans *trans, goto err; } } else { - if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", + if (fsck_err_on(s.subvol, + c, snapshot_should_not_have_subvol, + "snapshot should not point to subvol:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -803,7 +802,8 @@ static int check_snapshot(struct btree_trans *trans, if (ret < 0) goto err; - if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", + if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, + "snapshot points to missing/incorrect tree:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = snapshot_tree_ptr_repair(trans, iter, k, &s); if (ret) @@ -815,7 +815,8 @@ static int check_snapshot(struct btree_trans *trans, if (le32_to_cpu(s.depth) != real_depth && (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", + fsck_err(c, snapshot_bad_depth, + "snapshot with incorrect depth field, should be %u:\n %s", real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -832,7 +833,8 @@ static int check_snapshot(struct btree_trans *trans, if (!ret && (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || - fsck_err(c, "snapshot with bad skiplist field:\n %s", + fsck_err(c, snapshot_bad_skiplist, + "snapshot with bad skiplist field:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -1251,13 +1253,7 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans, return 0; } -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. - */ -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) +static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_s_c_snapshot snap; u32 children[2]; @@ -1278,10 +1274,21 @@ static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btre bch2_snapshot_live(trans, children[1]); if (ret < 0) return ret; + return !ret; +} - if (!ret) - return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); - return 0; +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. + */ +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) +{ + int ret = bch2_snapshot_needs_delete(trans, k); + + return ret <= 0 + ? ret + : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, @@ -1342,12 +1349,12 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, u32 id = le32_to_cpu(s->v.skip[j]); if (snapshot_list_has_id(deleted, id)) { - id = depth > 1 - ? bch2_snapshot_nth_parent_skip(c, + id = bch2_snapshot_nth_parent_skip(c, parent, - get_random_u32_below(depth - 1), - deleted) - : parent; + depth > 1 + ? get_random_u32_below(depth - 1) + : 0, + deleted); s->v.skip[j] = cpu_to_le32(id); } } @@ -1369,6 +1376,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) u32 *i, id; int ret = 0; + if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) + return 0; + if (!test_bit(BCH_FS_STARTED, &c->flags)) { ret = bch2_fs_read_write_early(c); if (ret) { @@ -1386,7 +1396,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, NULL, NULL, 0, - bch2_delete_redundant_snapshot(trans, &iter, k)); + bch2_delete_redundant_snapshot(trans, k)); if (ret) { bch_err_msg(c, ret, "deleting redundant snapshots"); goto err; @@ -1427,6 +1437,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!btree_type_has_snapshots(id)) continue; + /* + * deleted inodes btree is maintained by a trigger on the inodes + * btree - no work for us to do here, and it's not safe to scan + * it because we'll see out of date keys due to the btree write + * buffer: + */ + if (id == BTREE_ID_deleted_inodes) + continue; + ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, @@ -1447,6 +1466,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) } } + bch2_trans_unlock(trans); down_write(&c->snapshot_create_lock); for_each_btree_key(trans, iter, BTREE_ID_snapshots, @@ -1491,8 +1511,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err_create_lock; } } - - clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); err_create_lock: up_write(&c->snapshot_create_lock); err: @@ -1508,8 +1526,7 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); - if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) - bch2_delete_dead_snapshots(c); + bch2_delete_dead_snapshots(c); bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } @@ -1520,20 +1537,6 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } -int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - struct bch_fs *c = trans->c; - - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) - return 0; - - bch2_delete_dead_snapshots_async(c); - return 0; -} - int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, enum btree_id id, struct bpos pos) @@ -1664,6 +1667,26 @@ again: return ret ?: trans_was_restarted(trans, restart_count); } +static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot snap; + int ret = 0; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || + (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + return 0; + } + + return ret; +} + int bch2_snapshots_read(struct bch_fs *c) { struct btree_iter iter; @@ -1674,7 +1697,8 @@ int bch2_snapshots_read(struct bch_fs *c) for_each_btree_key2(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(trans, k)) ?: + bch2_snapshot_set_equiv(trans, k) ?: + bch2_check_snapshot_needs_deletion(trans, k)) ?: for_each_btree_key2(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index de215d9d1252..f09a22f44239 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -5,7 +5,7 @@ enum bkey_invalid_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ @@ -19,7 +19,7 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s_c, unsigned); @@ -244,8 +244,6 @@ int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -int bch2_delete_dead_snapshots_hook(struct btree_trans *, - struct btree_trans_commit_hook *); void bch2_delete_dead_snapshots_work(struct work_struct *); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index caf2dd7dafff..fccd25aa3242 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -62,7 +62,8 @@ static int check_subvol(struct btree_trans *trans, if (ret) return ret; - if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c, + if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, + c, subvol_not_master_and_not_snapshot, "subvolume %llu is not set as snapshot but is not master subvolume", k.k->p.offset)) { struct bkey_i_subvolume *s = @@ -97,16 +98,17 @@ int bch2_check_subvols(struct bch_fs *c) /* Subvolumes: */ -int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX)) { - prt_printf(err, "invalid pos"); - return -BCH_ERR_invalid_bkey; - } + int ret = 0; - return 0; + bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || + bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, + subvol_pos_bad, + "invalid pos"); +fsck_err: + return ret; } void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, @@ -230,7 +232,6 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; struct bkey_s_c_subvolume subvol; - struct btree_trans_commit_hook *h; u32 snapid; int ret = 0; @@ -246,22 +247,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) snapid = le32_to_cpu(subvol.v->snapshot); - ret = bch2_btree_delete_at(trans, &iter, 0); - if (ret) - goto err; - - ret = bch2_snapshot_node_set_deleted(trans, snapid); - if (ret) - goto err; - - h = bch2_trans_kmalloc(trans, sizeof(*h)); - ret = PTR_ERR_OR_ZERO(h); - if (ret) - goto err; - - h->fn = bch2_delete_dead_snapshots_hook; - bch2_trans_commit_hook(trans, h); -err: + ret = bch2_btree_delete_at(trans, &iter, 0) ?: + bch2_snapshot_node_set_deleted(trans, snapid); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index bb14f92e8687..a1003d30ab0a 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -9,7 +9,7 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); -int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 332d41e1c0a3..f4cad903f4d6 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -13,6 +13,7 @@ #include "replicas.h" #include "quota.h" #include "sb-clean.h" +#include "sb-errors.h" #include "sb-members.h" #include "super-io.h" #include "super.h" @@ -720,7 +721,7 @@ retry: if (opt_defined(*opts, sb)) goto err; - printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); printbuf_reset(&err); @@ -782,7 +783,7 @@ got_super: ret = bch2_sb_validate(sb, &err, READ); if (ret) { - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); goto err_no_print; } @@ -790,7 +791,7 @@ out: printbuf_exit(&err); return ret; err: - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", path, err.buf); err_no_print: bch2_free_super(sb); @@ -805,7 +806,12 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, + bio_data_dir(bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "superblock %s error: %s", + bio_data_dir(bio) ? "write" : "read", bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; @@ -892,7 +898,9 @@ int bch2_write_super(struct bch_fs *c) SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); bch2_sb_counters_from_cpu(c); - bch_members_cpy_v2_v1(&c->disk_sb); + bch2_sb_members_from_cpu(c); + bch2_sb_members_cpy_v2_v1(&c->disk_sb); + bch2_sb_errors_from_cpu(c); for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -1175,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "Created:"); prt_tab(out); if (sb->time_base_lo) - pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); else prt_printf(out, "(not set)"); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index b0d8584f475f..f5abd102bff7 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -23,6 +23,11 @@ u64 bch2_upgrade_recovery_passes(struct bch_fs *c, unsigned, unsigned); +static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) +{ + return le32_to_cpu(f->u64s) * sizeof(u64); +} + #define field_to_type(_f, _name) \ container_of_or_null(_f, struct bch_sb_field_##_name, field) @@ -78,41 +83,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) __bch2_check_set_feature(c, feat); } -/* BCH_SB_FIELD_members_v1: */ - -static inline bool bch2_member_exists(struct bch_member *m) -{ - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); -} - -static inline bool bch2_dev_exists(struct bch_sb *sb, - unsigned dev) -{ - if (dev < sb->nr_devices) { - struct bch_member m = bch2_sb_member_get(sb, dev); - return bch2_member_exists(&m); - } - return false; -} - -static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -{ - return (struct bch_member_cpu) { - .nbuckets = le64_to_cpu(mi->nbuckets), - .first_bucket = le16_to_cpu(mi->first_bucket), - .bucket_size = le16_to_cpu(mi->bucket_size), - .group = BCH_MEMBER_GROUP(mi), - .state = BCH_MEMBER_STATE(mi), - .discard = BCH_MEMBER_DISCARD(mi), - .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), - .durability = BCH_MEMBER_DURABILITY(mi) - ? BCH_MEMBER_DURABILITY(mi) - 1 - : 1, - .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .valid = bch2_member_exists(mi), - }; -} - void bch2_sb_maybe_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0e85c22672be..24672bb31cbe 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -49,6 +49,7 @@ #include "recovery.h" #include "replicas.h" #include "sb-clean.h" +#include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" #include "subvolume.h" @@ -400,7 +401,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch_info(c, "going read-write"); - ret = bch2_members_v2_init(c); + ret = bch2_sb_members_v2_init(c); if (ret) goto err; @@ -481,6 +482,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_time_stats_exit(&c->times[i]); bch2_free_pending_node_rewrites(c); + bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); @@ -713,6 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_quota_init(c); bch2_fs_ec_init_early(c); bch2_fs_move_init(c); + bch2_fs_sb_errors_init_early(c); INIT_LIST_HEAD(&c->list); @@ -729,8 +732,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->journal_iters); - INIT_LIST_HEAD(&c->fsck_errors); - mutex_init(&c->fsck_error_lock); + INIT_LIST_HEAD(&c->fsck_error_msgs); + mutex_init(&c->fsck_error_msgs_lock); seqcount_init(&c->gc_pos_lock); @@ -840,6 +843,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) } ret = bch2_fs_counters_init(c) ?: + bch2_fs_sb_errors_init(c) ?: bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: @@ -942,16 +946,13 @@ int bch2_fs_start(struct bch_fs *c) mutex_lock(&c->sb_lock); - ret = bch2_members_v2_init(c); + ret = bch2_sb_members_v2_init(c); if (ret) { mutex_unlock(&c->sb_lock); goto err; } for_each_online_member(ca, c, i) - bch2_sb_from_fs(c, ca); - - for_each_online_member(ca, c, i) bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); mutex_unlock(&c->sb_lock); @@ -960,12 +961,6 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { - mutex_lock(&c->btree_transaction_stats[i].lock); - bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); - mutex_unlock(&c->btree_transaction_stats[i].lock); - } - ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); @@ -1140,6 +1135,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, struct bch_member *member) { struct bch_dev *ca; + unsigned i; ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) @@ -1157,6 +1153,10 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, bch2_time_stats_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); + + for (i = 0; i < ARRAY_SIZE(member->errors); i++) + atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); + ca->uuid = member->uuid; ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, @@ -1591,7 +1591,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { - bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); if (label.allocation_failure) { ret = -ENOMEM; goto err; @@ -1631,16 +1631,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; } - mi = bch2_sb_field_get(ca->disk_sb.sb, members_v2); - - if (!bch2_sb_field_resize(&ca->disk_sb, members_v2, - le32_to_cpu(mi->field.u64s) + - sizeof(dev_mi) / sizeof(u64))) { - ret = -BCH_ERR_ENOSPC_sb_members; - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - } - if (dynamic_fault("bcachefs:add:no_slot")) goto no_slot; @@ -1654,6 +1644,8 @@ no_slot: have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + + mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); @@ -1689,13 +1681,13 @@ have_slot: ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { - bch_err_msg(c, ret, "marking new superblock"); + bch_err_msg(ca, ret, "marking new superblock"); goto err_late; } ret = bch2_fs_freespace_init(c); if (ret) { - bch_err_msg(c, ret, "initializing free space"); + bch_err_msg(ca, ret, "initializing free space"); goto err_late; } @@ -1763,19 +1755,26 @@ int bch2_dev_online(struct bch_fs *c, const char *path) if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); - mutex_lock(&c->sb_lock); - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + if (!ca->mi.freespace_initialized) { + ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) + goto err; + } - m->last_mount = - cpu_to_le64(ktime_get_real_seconds()); + if (!ca->journal.nr) { + ret = bch2_dev_journal_alloc(ca); + bch_err_msg(ca, ret, "allocating journal"); + if (ret) + goto err; + } + mutex_lock(&c->sb_lock); + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); mutex_unlock(&c->sb_lock); - ret = bch2_fs_freespace_init(c); - if (ret) - bch_err_msg(c, ret, "initializing free space"); - up_write(&c->state_lock); return 0; err: @@ -1886,9 +1885,9 @@ found: struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { - struct bch_sb_handle *sb = NULL; + DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; - unsigned i, best_sb = 0; + struct bch_sb_handle *sb, *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; @@ -1900,49 +1899,46 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } - sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); - if (!sb) { - ret = -ENOMEM; + ret = darray_make_room(&sbs, nr_devices); + if (ret) goto err; - } - for (i = 0; i < nr_devices; i++) { - ret = bch2_read_super(devices[i], &opts, &sb[i]); + for (unsigned i = 0; i < nr_devices; i++) { + struct bch_sb_handle sb = { NULL }; + + ret = bch2_read_super(devices[i], &opts, &sb); if (ret) goto err; + BUG_ON(darray_push(&sbs, sb)); } - for (i = 1; i < nr_devices; i++) - if (le64_to_cpu(sb[i].sb->seq) > - le64_to_cpu(sb[best_sb].sb->seq)) - best_sb = i; - - i = 0; - while (i < nr_devices) { - if (i != best_sb && - !bch2_dev_exists(sb[best_sb].sb, sb[i].sb->dev_idx)) { - pr_info("%pg has been removed, skipping", sb[i].bdev); - bch2_free_super(&sb[i]); - array_remove_item(sb, nr_devices, i); + darray_for_each(sbs, sb) + if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + best = sb; + + darray_for_each_reverse(sbs, sb) { + if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { + pr_info("%pg has been removed, skipping", sb->bdev); + bch2_free_super(sb); + darray_remove_item(&sbs, sb); + best -= best > sb; continue; } - ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + ret = bch2_dev_in_fs(best->sb, sb->sb); if (ret) goto err_print; - i++; } - c = bch2_fs_alloc(sb[best_sb].sb, opts); - if (IS_ERR(c)) { - ret = PTR_ERR(c); + c = bch2_fs_alloc(best->sb, opts); + ret = PTR_ERR_OR_ZERO(c); + if (ret) goto err; - } down_write(&c->state_lock); - for (i = 0; i < nr_devices; i++) { - ret = bch2_dev_attach_bdev(c, &sb[i]); + darray_for_each(sbs, sb) { + ret = bch2_dev_attach_bdev(c, sb); if (ret) { up_write(&c->state_lock); goto err; @@ -1961,7 +1957,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err; } out: - kfree(sb); + darray_for_each(sbs, sb) + bch2_free_super(sb); + darray_exit(&sbs); printbuf_exit(&errbuf); module_put(THIS_MODULE); return c; @@ -1971,9 +1969,6 @@ err_print: err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); - if (sb) - for (i = 0; i < nr_devices; i++) - bch2_free_super(&sb[i]); c = ERR_PTR(ret); goto out; } diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 78d6138db62d..7dda4985b99f 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -37,16 +37,4 @@ struct bch_member_cpu { u8 valid; }; -struct bch_disk_group_cpu { - bool deleted; - u16 parent; - struct bch_devs_mask devs; -}; - -struct bch_disk_groups_cpu { - struct rcu_head rcu; - unsigned nr; - struct bch_disk_group_cpu entries[] __counted_by(nr); -}; - #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 397116966a7c..ab743115f169 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -149,7 +149,9 @@ read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); rw_attribute(durability); -read_attribute(iodone); +read_attribute(io_done); +read_attribute(io_errors); +write_attribute(io_errors_reset); read_attribute(io_latency_read); read_attribute(io_latency_write); @@ -212,7 +214,7 @@ read_attribute(copy_gc_wait); rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); -read_attribute(rebalance_work); +read_attribute(rebalance_status); rw_attribute(promote_whole_extents); read_attribute(new_stripes); @@ -341,7 +343,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); + prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); bch2_bpos_to_text(out, c->gc_gens_pos); prt_printf(out, "\n"); } @@ -386,8 +388,8 @@ SHOW(bch2_fs) if (attr == &sysfs_copy_gc_wait) bch2_copygc_wait_to_text(out, c); - if (attr == &sysfs_rebalance_work) - bch2_rebalance_work_to_text(out, c); + if (attr == &sysfs_rebalance_status) + bch2_rebalance_status_to_text(out, c); sysfs_print(promote_whole_extents, c->promote_whole_extents); @@ -646,7 +648,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_copy_gc_wait, &sysfs_rebalance_enabled, - &sysfs_rebalance_work, + &sysfs_rebalance_status, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, @@ -707,10 +709,8 @@ STORE(bch2_fs_opts_dir) bch2_opt_set_by_id(&c->opts, id, v); if ((id == Opt_background_target || - id == Opt_background_compression) && v) { - bch2_rebalance_add_work(c, S64_MAX); - rebalance_wakeup(c); - } + id == Opt_background_compression) && v) + bch2_set_rebalance_needs_scan(c, 0); ret = size; err: @@ -882,7 +882,7 @@ static const char * const bch2_rw[] = { NULL }; -static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) +static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) { int rw, i; @@ -910,13 +910,8 @@ SHOW(bch2_dev) sysfs_print(discard, ca->mi.discard); if (attr == &sysfs_label) { - if (ca->mi.group) { - mutex_lock(&c->sb_lock); - bch2_disk_path_to_text(out, c->disk_sb.sb, - ca->mi.group - 1); - mutex_unlock(&c->sb_lock); - } - + if (ca->mi.group) + bch2_disk_path_to_text(out, c, ca->mi.group - 1); prt_char(out, '\n'); } @@ -930,8 +925,11 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_iodone) - dev_iodone_to_text(out, ca); + if (attr == &sysfs_io_done) + dev_io_done_to_text(out, ca); + + if (attr == &sysfs_io_errors) + bch2_dev_io_errors_to_text(out, ca); sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); @@ -998,6 +996,9 @@ STORE(bch2_dev) return ret; } + if (attr == &sysfs_io_errors_reset) + bch2_dev_errors_reset(ca); + return size; } SYSFS_OPS(bch2_dev); @@ -1015,7 +1016,9 @@ struct attribute *bch2_dev_files[] = { &sysfs_label, &sysfs_has_data, - &sysfs_iodone, + &sysfs_io_done, + &sysfs_io_errors, + &sysfs_io_errors_reset, &sysfs_io_latency_read, &sysfs_io_latency_write, diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index 33efa6005c6f..dc48b52b01b4 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -7,6 +7,7 @@ #include "btree_locking.h" #include "btree_update_interior.h" #include "keylist.h" +#include "move_types.h" #include "opts.h" #include "six.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 19264492151b..893304a1f06e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(btree_node, TP_printk("%d,%d %u %s %llu:%llu:%u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->level, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) ); @@ -461,7 +461,7 @@ TRACE_EVENT(btree_path_relock_fail, TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, @@ -522,7 +522,7 @@ TRACE_EVENT(btree_path_upgrade_fail, TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, @@ -767,25 +767,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, ); TRACE_EVENT(move_data, - TP_PROTO(struct bch_fs *c, u64 sectors_moved, - u64 keys_moved), - TP_ARGS(c, sectors_moved, keys_moved), + TP_PROTO(struct bch_fs *c, + struct bch_move_stats *stats), + TP_ARGS(c, stats), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, sectors_moved ) + __field(dev_t, dev ) __field(u64, keys_moved ) + __field(u64, keys_raced ) + __field(u64, sectors_seen ) + __field(u64, sectors_moved ) + __field(u64, sectors_raced ) ), TP_fast_assign( - __entry->dev = c->dev; - __entry->sectors_moved = sectors_moved; - __entry->keys_moved = keys_moved; + __entry->dev = c->dev; + __entry->keys_moved = atomic64_read(&stats->keys_moved); + __entry->keys_raced = atomic64_read(&stats->keys_raced); + __entry->sectors_seen = atomic64_read(&stats->sectors_seen); + __entry->sectors_moved = atomic64_read(&stats->sectors_moved); + __entry->sectors_raced = atomic64_read(&stats->sectors_raced); ), - TP_printk("%d,%d sectors_moved %llu keys_moved %llu", + TP_printk("%d,%d keys moved %llu raced %llu" + "sectors seen %llu moved %llu raced %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->sectors_moved, __entry->keys_moved) + __entry->keys_moved, + __entry->keys_raced, + __entry->sectors_seen, + __entry->sectors_moved, + __entry->sectors_raced) ); TRACE_EVENT(evacuate_bucket, @@ -1012,7 +1023,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, TP_printk("%s %pS btree %s pos %llu:%llu:%u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) @@ -1032,13 +1043,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); +struct get_locks_fail; + TRACE_EVENT(trans_restart_upgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, struct btree_path *path, unsigned old_locks_want, - unsigned new_locks_want), - TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), + unsigned new_locks_want, + struct get_locks_fail *f), + TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), TP_STRUCT__entry( __array(char, trans_fn, 32 ) @@ -1046,6 +1060,11 @@ TRACE_EVENT(trans_restart_upgrade, __field(u8, btree_id ) __field(u8, old_locks_want ) __field(u8, new_locks_want ) + __field(u8, level ) + __field(u32, path_seq ) + __field(u32, node_seq ) + __field(u32, path_alloc_seq ) + __field(u32, downgrade_seq) TRACE_BPOS_entries(pos) ), @@ -1055,18 +1074,28 @@ TRACE_EVENT(trans_restart_upgrade, __entry->btree_id = path->btree_id; __entry->old_locks_want = old_locks_want; __entry->new_locks_want = new_locks_want; + __entry->level = f->l; + __entry->path_seq = path->l[f->l].lock_seq; + __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; + __entry->path_alloc_seq = path->alloc_seq; + __entry->downgrade_seq = path->downgrade_seq; TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, __entry->old_locks_want, - __entry->new_locks_want) + __entry->new_locks_want, + __entry->level, + __entry->path_seq, + __entry->node_seq, + __entry->path_alloc_seq, + __entry->downgrade_seq) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, @@ -1219,7 +1248,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", __entry->trans_fn, (void *) __entry->caller_ip, - bch2_btree_ids[__entry->btree_id], + bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot, @@ -1227,6 +1256,27 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +TRACE_EVENT(path_downgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + + TP_printk("%s %pS", + __entry->trans_fn, + (void *) __entry->caller_ip) +); + DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 08bac0ba8d0b..84b142fcc3df 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -467,6 +467,24 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) prt_printf(out, "%s", u->name); } +#ifndef __KERNEL__ +#include <time.h> +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + time_t t = sec; + char buf[64]; + ctime_r(&t, buf); + prt_str(out, buf); +} +#else +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ptT", &sec); + prt_u64(out, sec); +} +#endif + #define TABSTOP_SIZE 12 static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 849a37ae497c..2984b57b2958 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -245,26 +245,7 @@ do { \ #define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) void bch2_pr_time_units(struct printbuf *, u64); - -#ifdef __KERNEL__ -static inline void pr_time(struct printbuf *out, u64 time) -{ - prt_printf(out, "%llu", time); -} -#else -#include <time.h> -static inline void pr_time(struct printbuf *out, u64 _time) -{ - char time_str[64]; - time_t time = _time; - struct tm *tm = localtime(&time); - size_t err = strftime(time_str, sizeof(time_str), "%c", tm); - if (!err) - prt_printf(out, "(formatting error)"); - else - prt_printf(out, "%s", time_str); -} -#endif +void bch2_prt_datetime(struct printbuf *, time64_t); #ifdef __KERNEL__ static inline void uuid_unparse_lower(u8 *uuid, char *out) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index b069b1a62e25..a39ff0c296ec 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,46 +70,38 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, +int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { - const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len)); + int ret = 0; - if (bkey_val_u64s(k.k) < - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len))) { - prt_printf(err, "value too small (%zu < %u)", - bkey_val_u64s(k.k), - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len))); - return -BCH_ERR_invalid_bkey; - } + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, + xattr_val_size_too_small, + "value too small (%zu < %u)", + bkey_val_u64s(k.k), val_u64s); /* XXX why +4 ? */ - if (bkey_val_u64s(k.k) > - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4)) { - prt_printf(err, "value too big (%zu > %u)", - bkey_val_u64s(k.k), - xattr_val_u64s(xattr.v->x_name_len, - le16_to_cpu(xattr.v->x_val_len) + 4)); - return -BCH_ERR_invalid_bkey; - } - - handler = bch2_xattr_type_to_handler(xattr.v->x_type); - if (!handler) { - prt_printf(err, "invalid type (%u)", xattr.v->x_type); - return -BCH_ERR_invalid_bkey; - } - - if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { - prt_printf(err, "xattr name has invalid characters"); - return -BCH_ERR_invalid_bkey; - } - - return 0; + val_u64s = xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4); + + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, + xattr_val_size_too_big, + "value too big (%zu > %u)", + bkey_val_u64s(k.k), val_u64s); + + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, + xattr_invalid_type, + "invalid type (%u)", xattr.v->x_type); + + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, + xattr_name_invalid_chars, + "xattr name has invalid characters"); +fsck_err: + return ret; } void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, @@ -590,7 +582,7 @@ err: if (value && (opt_id == Opt_background_compression || opt_id == Opt_background_target)) - bch2_rebalance_add_work(c, inode->v.i_blocks); + bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); return bch2_err_class(ret); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index f5a52e3a6016..1337f31a5c49 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,7 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, +int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9acdec56f626..a93d76df8ed8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -96,6 +96,7 @@ static const struct address_space_operations befs_symlink_aops = { }; static const struct export_operations befs_export_operations = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = befs_fh_to_dentry, .fh_to_parent = befs_fh_to_parent, .get_parent = befs_get_parent, diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 87b3753aa4b1..c45e8c2d62e1 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -939,7 +939,7 @@ static ssize_t debugfs_write_file_str(struct file *file, const char __user *user new[pos + count] = '\0'; strim(new); - rcu_assign_pointer(*(char **)file->private_data, new); + rcu_assign_pointer(*(char __rcu **)file->private_data, new); synchronize_rcu(); kfree(old); diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 76dd3c7295d9..91290fe4a70b 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -21,8 +21,12 @@ struct inode *efivarfs_get_inode(struct super_block *sb, dev_t dev, bool is_removable) { struct inode *inode = new_inode(sb); + struct efivarfs_fs_info *fsi = sb->s_fs_info; + struct efivarfs_mount_opts *opts = &fsi->mount_opts; if (inode) { + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 8ebf3a6a8aa2..c66647f5c0bd 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -9,6 +9,15 @@ #include <linux/list.h> #include <linux/efi.h> +struct efivarfs_mount_opts { + kuid_t uid; + kgid_t gid; +}; + +struct efivarfs_fs_info { + struct efivarfs_mount_opts mount_opts; +}; + struct efi_variable { efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)]; efi_guid_t VendorGuid; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 996271473609..77240953a92e 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -8,6 +8,7 @@ #include <linux/efi.h> #include <linux/fs.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/ucs2_string.h> @@ -24,12 +25,28 @@ static void efivarfs_evict_inode(struct inode *inode) clear_inode(inode); } +static int efivarfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct efivarfs_fs_info *sbi = sb->s_fs_info; + struct efivarfs_mount_opts *opts = &sbi->mount_opts; + + if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); + if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); + return 0; +} + static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) { const u32 attr = EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS; u64 storage_space, remaining_space, max_variable_size; + u64 id = huge_encode_dev(dentry->d_sb->s_dev); efi_status_t status; /* Some UEFI firmware does not implement QueryVariableInfo() */ @@ -53,6 +70,7 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = storage_space; buf->f_bfree = remaining_space; buf->f_type = dentry->d_sb->s_magic; + buf->f_fsid = u64_to_fsid(id); /* * In f_bavail we declare the free space that the kernel will allow writing @@ -70,6 +88,7 @@ static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, .drop_inode = generic_delete_inode, .evict_inode = efivarfs_evict_inode, + .show_options = efivarfs_show_options, }; /* @@ -231,6 +250,45 @@ static int efivarfs_destroy(struct efivar_entry *entry, void *data) return 0; } +enum { + Opt_uid, Opt_gid, +}; + +static const struct fs_parameter_spec efivarfs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + {}, +}; + +static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct efivarfs_fs_info *sbi = fc->s_fs_info; + struct efivarfs_mount_opts *opts = &sbi->mount_opts; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, efivarfs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(opts->uid)) + return -EINVAL; + break; + case Opt_gid: + opts->gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(opts->gid)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode = NULL; @@ -277,10 +335,21 @@ static int efivarfs_get_tree(struct fs_context *fc) static const struct fs_context_operations efivarfs_context_ops = { .get_tree = efivarfs_get_tree, + .parse_param = efivarfs_parse_param, }; static int efivarfs_init_fs_context(struct fs_context *fc) { + struct efivarfs_fs_info *sfi; + + sfi = kzalloc(sizeof(*sfi), GFP_KERNEL); + if (!sfi) + return -ENOMEM; + + sfi->mount_opts.uid = GLOBAL_ROOT_UID; + sfi->mount_opts.gid = GLOBAL_ROOT_GID; + + fc->s_fs_info = sfi; fc->ops = &efivarfs_context_ops; return 0; } @@ -301,6 +370,7 @@ static struct file_system_type efivarfs_type = { .name = "efivarfs", .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, + .parameters = efivarfs_parameters, }; static __init int efivarfs_init(void) diff --git a/fs/efs/super.c b/fs/efs/super.c index b287f47c165b..f17fdac76b2e 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -123,6 +123,7 @@ static const struct super_operations efs_superblock_operations = { }; static const struct export_operations efs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = efs_fh_to_dentry, .fh_to_parent = efs_fh_to_parent, .get_parent = efs_get_parent, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 976dc39a88f7..3789d6224513 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -567,6 +567,7 @@ static struct dentry *erofs_get_parent(struct dentry *child) } static const struct export_operations erofs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c20704aa21b3..3ae0154c5680 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -342,43 +342,30 @@ out: return error; } +#define FILEID_INO64_GEN_LEN 3 + /** - * export_encode_fh - default export_operations->encode_fh function + * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id * @inode: the object to encode * @fid: where to store the file handle fragment - * @max_len: maximum length to store there - * @parent: parent directory inode, if wanted + * @max_len: maximum length to store there (in 4 byte units) * - * This default encode_fh function assumes that the 32 inode number - * is suitable for locating an inode, and that the generation number - * can be used to check that it is still valid. It places them in the - * filehandle fragment where export_decode_fh expects to find them. + * This generic function is used to encode a non-decodeable file id for + * fanotify for filesystems that do not support NFS export. */ -static int export_encode_fh(struct inode *inode, struct fid *fid, - int *max_len, struct inode *parent) +static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid, + int *max_len) { - int len = *max_len; - int type = FILEID_INO32_GEN; - - if (parent && (len < 4)) { - *max_len = 4; - return FILEID_INVALID; - } else if (len < 2) { - *max_len = 2; + if (*max_len < FILEID_INO64_GEN_LEN) { + *max_len = FILEID_INO64_GEN_LEN; return FILEID_INVALID; } - len = 2; - fid->i32.ino = inode->i_ino; - fid->i32.gen = inode->i_generation; - if (parent) { - fid->i32.parent_ino = parent->i_ino; - fid->i32.parent_gen = parent->i_generation; - len = 4; - type = FILEID_INO32_GEN_PARENT; - } - *max_len = len; - return type; + fid->i64.ino = inode->i_ino; + fid->i64.gen = inode->i_generation; + *max_len = FILEID_INO64_GEN_LEN; + + return FILEID_INO64_GEN; } /** @@ -396,17 +383,13 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, { const struct export_operations *nop = inode->i_sb->s_export_op; - /* - * If a decodeable file handle was requested, we need to make sure that - * filesystem can decode file handles. - */ - if (nop && !(flags & EXPORT_FH_FID) && !nop->fh_to_dentry) + if (!exportfs_can_encode_fh(nop, flags)) return -EOPNOTSUPP; - if (nop && nop->encode_fh) - return nop->encode_fh(inode, fid->raw, max_len, parent); + if (!nop && (flags & EXPORT_FH_FID)) + return exportfs_encode_ino64_fid(inode, fid, max_len); - return export_encode_fh(inode, fid, max_len, parent); + return nop->encode_fh(inode, fid->raw, max_len, parent); } EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); @@ -456,7 +439,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, /* * Try to get any dentry for the given file handle from the filesystem. */ - if (!nop || !nop->fh_to_dentry) + if (!exportfs_can_decode_fh(nop)) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (IS_ERR_OR_NULL(result)) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 645ee6142f69..01f9addc8b1f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -397,6 +397,7 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, } static const struct export_operations ext2_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext2_fh_to_dentry, .fh_to_parent = ext2_fh_to_parent, .get_parent = ext2_get_parent, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 54a9dde7483a..c5fcf377ab1f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1654,6 +1654,7 @@ static const struct super_operations ext4_sops = { }; static const struct export_operations ext4_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d820801f473e..36e5dab6baae 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -893,14 +893,15 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc) bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { +#ifdef CONFIG_F2FS_CHECK_FS struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; - bool compressed = dn->data_blkaddr == COMPRESS_ADDR; int cluster_end = 0; + unsigned int count; int i; char *reason = ""; - if (!compressed) + if (dn->data_blkaddr != COMPRESS_ADDR) return false; /* [..., COMPR_ADDR, ...] */ @@ -909,7 +910,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) goto out; } - for (i = 1; i < cluster_size; i++) { + for (i = 1, count = 1; i < cluster_size; i++, count++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); @@ -929,19 +930,42 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) goto out; } } + + f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size && + !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED)); + return false; out: f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); set_sbi_flag(sbi, SBI_NEED_FSCK); return true; +#else + return false; +#endif +} + +static int __f2fs_get_cluster_blocks(struct inode *inode, + struct dnode_of_data *dn) +{ + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, i; + + for (i = 1, count = 1; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (__is_valid_data_blkaddr(blkaddr)) + count++; + } + + return count; } static int __f2fs_cluster_blocks(struct inode *inode, - unsigned int cluster_idx, bool compr) + unsigned int cluster_idx, bool compr_blks) { struct dnode_of_data dn; - unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; unsigned int start_idx = cluster_idx << F2FS_I(inode)->i_log_cluster_size; int ret; @@ -956,31 +980,14 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; - f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } if (dn.data_blkaddr == COMPRESS_ADDR) { - int i; - - ret = 1; - for (i = 1; i < cluster_size; i++) { - block_t blkaddr; - - blkaddr = data_blkaddr(dn.inode, - dn.node_page, dn.ofs_in_node + i); - if (compr) { - if (__is_valid_data_blkaddr(blkaddr)) - ret++; - } else { - if (blkaddr != NULL_ADDR) - ret++; - } - } - - f2fs_bug_on(F2FS_I_SB(inode), - !compr && ret != cluster_size && - !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)); + if (compr_blks) + ret = __f2fs_get_cluster_blocks(inode, &dn); + else + ret = 1; } fail: f2fs_put_dnode(&dn); @@ -993,7 +1000,7 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc) return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); } -/* return # of valid blocks in compressed cluster */ +/* return whether cluster is compressed one or not */ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { return __f2fs_cluster_blocks(inode, @@ -1976,7 +1983,7 @@ void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; - char slab_name[32]; + char slab_name[35]; if (!f2fs_sb_has_compression(sbi)) return 0; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 916e317ac925..4e42b5f24deb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1690,9 +1690,7 @@ next_block: map->m_flags |= F2FS_MAP_NEW; } else if (is_hole) { if (f2fs_compressed_file(inode) && - f2fs_sanity_check_cluster(&dn) && - (flag != F2FS_GET_BLOCK_FIEMAP || - IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + f2fs_sanity_check_cluster(&dn)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_CLUSTER); @@ -2344,8 +2342,10 @@ skip_reading_dnode: f2fs_wait_on_block_writeback(inode, blkaddr); if (f2fs_load_compressed_page(sbi, page, blkaddr)) { - if (atomic_dec_and_test(&dic->remaining_pages)) + if (atomic_dec_and_test(&dic->remaining_pages)) { f2fs_decompress_cluster(dic, true); + break; + } continue; } @@ -2665,6 +2665,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + /* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */ + if (f2fs_compressed_file(inode) && + F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) @@ -3023,7 +3028,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, { int ret = 0; int done = 0, retry = 0; - struct page *pages[F2FS_ONSTACK_PAGES]; + struct page *pages_local[F2FS_ONSTACK_PAGES]; + struct page **pages = pages_local; struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; @@ -3047,6 +3053,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, #endif int nr_folios, p, idx; int nr_pages; + unsigned int max_pages = F2FS_ONSTACK_PAGES; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -3056,6 +3063,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode) && + 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) { + pages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL); + max_pages = 1 << cc.log_cluster_size; + } +#endif + folio_batch_init(&fbatch); if (get_dirty_pages(mapping->host) <= @@ -3101,7 +3117,7 @@ again: add_more: pages[nr_pages] = folio_page(folio, idx); folio_get(folio); - if (++nr_pages == F2FS_ONSTACK_PAGES) { + if (++nr_pages == max_pages) { index = folio->index + idx + 1; folio_batch_release(&fbatch); goto write; @@ -3283,6 +3299,11 @@ next: if (bio) f2fs_submit_merged_ipu_write(sbi, &bio, NULL); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (pages != pages_local) + kfree(pages); +#endif + return ret; } @@ -4055,7 +4076,7 @@ next: sis->highest_bit = cur_lblock - 1; out: if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)", + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 0e2d49140c07..ad8dfac73bd4 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -74,40 +74,14 @@ static void __set_extent_info(struct extent_info *ei, } } -static bool __may_read_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, READ_EXTENT_CACHE)) - return false; - if (is_inode_flag_set(inode, FI_NO_EXTENT)) - return false; - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi)) - return false; - return S_ISREG(inode->i_mode); -} - -static bool __may_age_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, AGE_EXTENT_CACHE)) - return false; - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return false; - if (file_is_cold(inode)) - return false; - - return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); -} - static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { if (type == EX_READ) - return __may_read_extent_tree(inode); - else if (type == EX_BLOCK_AGE) - return __may_age_extent_tree(inode); + return test_opt(F2FS_I_SB(inode), READ_EXTENT_CACHE) && + S_ISREG(inode->i_mode); + if (type == EX_BLOCK_AGE) + return test_opt(F2FS_I_SB(inode), AGE_EXTENT_CACHE) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)); return false; } @@ -120,7 +94,22 @@ static bool __may_extent_tree(struct inode *inode, enum extent_type type) if (list_empty(&F2FS_I_SB(inode)->s_list)) return false; - return __init_may_extent_tree(inode, type); + if (!__init_may_extent_tree(inode, type)) + return false; + + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(F2FS_I_SB(inode))) + return false; + } else if (type == EX_BLOCK_AGE) { + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + } + return true; } static void __try_update_largest_extent(struct extent_tree *et, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dd99abbb7186..e50363583f01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3258,11 +3258,12 @@ int f2fs_precache_extents(struct inode *inode) return -EOPNOTSUPP; map.m_lblk = 0; + map.m_pblk = 0; map.m_next_pgofs = NULL; map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = max_file_blocks(inode); + end = F2FS_BLK_ALIGN(i_size_read(inode)); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; @@ -3270,7 +3271,7 @@ int f2fs_precache_extents(struct inode *inode) f2fs_down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - if (err) + if (err || !map.m_len) return err; map.m_lblk = m_next_extent; @@ -4005,6 +4006,15 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) F2FS_I(inode)->i_compress_algorithm = option.algorithm; F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); + /* Set default level */ + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) + F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + else + F2FS_I(inode)->i_compress_level = 0; + /* Adjust mount option level */ + if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level; f2fs_mark_inode_dirty_sync(inode, true); if (!f2fs_is_compress_backend_ready(inode)) @@ -4849,6 +4859,9 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, filp->f_mode &= ~FMODE_RANDOM; spin_unlock(&filp->f_lock); return 0; + } else if (advice == POSIX_FADV_WILLNEED && offset == 0) { + /* Load extent cache at the first readahead. */ + f2fs_precache_extents(inode); } err = generic_fadvise(filp, offset, len, advice); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5779c7edd49b..560bfcad1af2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -315,7 +315,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) f2fs_has_inline_xattr(inode) && (!fi->i_inline_xattr_size || fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %lu", __func__, inode->i_ino, fi->i_inline_xattr_size, MAX_INLINE_XATTR_SIZE); return false; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ee2e1dd64f25..6c7f6a649d27 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -633,7 +633,7 @@ static void f2fs_ra_node_pages(struct page *parent, int start, int n) /* Then, try readahead for siblings of the desired node */ end = start + n; - end = min(end, NIDS_PER_BLOCK); + end = min(end, (int)NIDS_PER_BLOCK); for (i = start; i < end; i++) { nid = get_nid(parent, i, false); f2fs_ra_node_page(sbi, nid); @@ -1467,7 +1467,8 @@ page_hit: ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + err = -EFSCORRUPTED; out_err: ClearPageUptodate(page); out_put_err: @@ -2389,7 +2390,7 @@ static int scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); if (blk_addr == NEW_ADDR) - return -EINVAL; + return -EFSCORRUPTED; if (blk_addr == NULL_ADDR) { add_free_nid(sbi, start_nid, true, true); @@ -2504,7 +2505,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, if (ret) { f2fs_up_read(&nm_i->nat_tree_lock); - f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); + + if (ret == -EFSCORRUPTED) { + f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_NAT); + } + return ret; } } @@ -2743,7 +2751,9 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + if (page) + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), + VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d05b41608fc0..727d016318f9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4910,22 +4910,31 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, } /* - * The write pointer matches with the valid blocks or - * already points to the end of the zone. + * When safely unmounted in the previous mount, we can trust write + * pointers. Otherwise, finish zones. */ - if ((last_valid_block + 1 == wp_block) || - (zone->wp == zone->start + zone->len)) - return 0; - - if (last_valid_block + 1 == zone_block) { + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { /* - * If there is no valid block in the zone and if write pointer - * is not at zone start, reset the write pointer. + * The write pointer matches with the valid blocks or + * already points to the end of the zone. */ - f2fs_notice(sbi, - "Zone without valid block has non-zero write " - "pointer. Reset the write pointer: wp[0x%x,0x%x]", - wp_segno, wp_blkoff); + if ((last_valid_block + 1 == wp_block) || + (zone->wp == zone->start + zone->len)) + return 0; + } + + if (last_valid_block + 1 == zone_block) { + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + /* + * If there is no valid block in the zone and if write + * pointer is not at zone start, reset the write + * pointer. + */ + f2fs_notice(sbi, + "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: wp[0x%x,0x%x]", + wp_segno, wp_blkoff); + } ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) @@ -4935,18 +4944,20 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, return ret; } - /* - * If there are valid blocks and the write pointer doesn't - * match with them, we need to report the inconsistency and - * fill the zone till the end to close the zone. This inconsistency - * does not cause write error because the zone will not be selected - * for write operation until it get discarded. - */ - f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: " - "valid block[0x%x,0x%x] wp[0x%x,0x%x]", - GET_SEGNO(sbi, last_valid_block), - GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), - wp_segno, wp_blkoff); + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + /* + * If there are valid blocks and the write pointer doesn't match + * with them, we need to report the inconsistency and fill + * the zone till the end to close the zone. This inconsistency + * does not cause write error because the zone will not be + * selected for write operation until it get discarded. + */ + f2fs_notice(sbi, "Valid blocks are not aligned with write " + "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + } ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len, GFP_NOFS); @@ -5020,18 +5031,27 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); - wp_segno = GET_SEGNO(sbi, wp_block); - wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); - - if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && - wp_sector_off == 0) - return 0; + /* + * When safely unmounted in the previous mount, we could use current + * segments. Otherwise, allocate new sections. + */ + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + + if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && + wp_sector_off == 0) + return 0; - f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " - "curseg[0x%x,0x%x] wp[0x%x,0x%x]", - type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); + f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " + "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, + cs->next_blkoff, wp_segno, wp_blkoff); + } else { + f2fs_notice(sbi, "Not successfully unmounted in the previous " + "mount"); + } f2fs_notice(sbi, "Assign new section to curseg[%d]: " "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 2ca8fb5d0dc4..8129be788bd5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -108,11 +108,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\ (sbi)->log_blocks_per_seg)) #define GET_SEC_FROM_SEG(sbi, segno) \ - (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 05f9f7b6ebf8..033af907c3b1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -562,6 +562,29 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, } #ifdef CONFIG_F2FS_FS_COMPRESSION +static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, + const char *new_ext, bool is_ext) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + int ext_cnt; + int i; + + if (is_ext) { + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + } else { + ext = F2FS_OPTION(sbi).noextensions; + ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + } + + for (i = 0; i < ext_cnt; i++) { + if (!strcasecmp(new_ext, ext[i])) + return true; + } + + return false; +} + /* * 1. The same extension name cannot not appear in both compress and non-compress extension * at the same time. @@ -1164,6 +1187,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } + if (is_compress_extension_exist(sbi, name, true)) { + kfree(name); + break; + } + strcpy(ext[ext_cnt], name); F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); @@ -1188,6 +1216,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } + if (is_compress_extension_exist(sbi, name, false)) { + kfree(name); + break; + } + strcpy(noext[noext_cnt], name); F2FS_OPTION(sbi).nocompress_ext_cnt++; kfree(name); @@ -1644,7 +1677,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); - if (err) { + if (err || f2fs_cp_error(sbi)) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); } @@ -2301,9 +2334,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_sb_flags; int err; bool need_restart_gc = false, need_stop_gc = false; - bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; + bool need_enable_checkpoint = false, need_disable_checkpoint = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); @@ -2467,24 +2500,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || - !test_opt(sbi, MERGE_CHECKPOINT)) { - f2fs_stop_ckpt_thread(sbi); - need_restart_ckpt = true; - } else { - /* Flush if the prevous checkpoint, if exists. */ - f2fs_flush_ckpt_thread(sbi); - - err = f2fs_start_ckpt_thread(sbi); - if (err) { - f2fs_err(sbi, - "Failed to start F2FS issue_checkpoint_thread (%d)", - err); - goto restore_gc; - } - need_stop_ckpt = true; - } - /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -2496,7 +2511,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } else { err = f2fs_create_flush_cmd_control(sbi); if (err) - goto restore_ckpt; + goto restore_gc; need_stop_flush = true; } @@ -2518,8 +2533,31 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = f2fs_disable_checkpoint(sbi); if (err) goto restore_discard; + need_enable_checkpoint = true; } else { f2fs_enable_checkpoint(sbi); + need_disable_checkpoint = true; + } + } + + /* + * Place this routine at the end, since a new checkpoint would be + * triggered while remount and we need to take care of it before + * returning from remount. + */ + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + } else { + /* Flush if the prevous checkpoint, if exists. */ + f2fs_flush_ckpt_thread(sbi); + + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_checkpoint; } } @@ -2537,6 +2575,13 @@ skip: adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_checkpoint: + if (need_enable_checkpoint) { + f2fs_enable_checkpoint(sbi); + } else if (need_disable_checkpoint) { + if (f2fs_disable_checkpoint(sbi)) + f2fs_warn(sbi, "checkpoint has not been disabled"); + } restore_discard: if (need_restart_discard) { if (f2fs_start_discard_thread(sbi)) @@ -2552,13 +2597,6 @@ restore_flush: clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); } -restore_ckpt: - if (need_restart_ckpt) { - if (f2fs_start_ckpt_thread(sbi)) - f2fs_warn(sbi, "background ckpt thread has stopped"); - } else if (need_stop_ckpt) { - f2fs_stop_ckpt_thread(sbi); - } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) @@ -3292,6 +3330,7 @@ static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, } static const struct export_operations f2fs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = f2fs_fh_to_dentry, .fh_to_parent = f2fs_fh_to_parent, .get_parent = f2fs_get_parent, @@ -3479,7 +3518,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - /* Currently, support 512/1024/2048/4096 bytes sector size */ + /* Currently, support 512/1024/2048/4096/16K bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || le32_to_cpu(raw_super->log_sectorsize) < @@ -4926,7 +4965,7 @@ static int __init init_f2fs_fs(void) int err; if (PAGE_SIZE != F2FS_BLKSIZE) { - printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n", + printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n", PAGE_SIZE, F2FS_BLKSIZE); return -EINVAL; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 4314456854f6..47e88b4d4e7d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -364,10 +364,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - err = -EFSCORRUPTED; + err = -ENODATA; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); goto out; @@ -584,13 +584,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - error = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); - goto cleanup; + break; } if (!prefix) @@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - +retry: error = read_all_xattrs(inode, ipage, &base_addr); if (error) return error; @@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index, /* find entry with wanted name. */ here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + if (!F2FS_I(inode)->i_xattr_nid) { + f2fs_notice(F2FS_I_SB(inode), + "recover xattr in inode (%lu)", inode->i_ino); + f2fs_recover_xattr_data(inode, NULL); + kfree(base_addr); + goto retry; + } + f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 3626eb585a98..c52e63e10d35 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -279,6 +279,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir) } const struct export_operations fat_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = fat_fh_to_dentry, .fh_to_parent = fat_fh_to_parent, .get_parent = fat_get_parent, diff --git a/fs/fhandle.c b/fs/fhandle.c index 6ea8d35a9382..18b3ba8dc8ea 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -26,12 +26,8 @@ static long do_sys_name_to_handle(const struct path *path, /* * We need to make sure whether the file system support decoding of * the file handle if decodeable file handle was requested. - * Otherwise, even empty export_operations are sufficient to opt-in - * to encoding FIDs. */ - if (!path->dentry->d_sb->s_export_op || - (!(fh_flags & EXPORT_FH_FID) && - !path->dentry->d_sb->s_export_op->fh_to_dentry)) + if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags)) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 310d73e254df..e6e2a2185e7c 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -76,6 +76,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) { struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); struct vxfs_sb *raw_sb = infp->vsi_raw; + u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); bufp->f_type = VXFS_SUPER_MAGIC; bufp->f_bsize = dentry->d_sb->s_blocksize; @@ -84,6 +85,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) bufp->f_bavail = 0; bufp->f_files = 0; bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree); + bufp->f_fsid = u64_to_fsid(id); bufp->f_namelen = VXFS_NAMELEN; return 0; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index caa8121ad99c..74d4f09d5827 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -999,7 +999,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, } *max_len = len; - return parent ? 0x82 : 0x81; + return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *fuse_fh_to_dentry(struct super_block *sb, @@ -1007,7 +1007,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb, { struct fuse_inode_handle handle; - if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + if ((fh_type != FILEID_INO64_GEN && + fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) return NULL; handle.nodeid = (u64) fid->raw[0] << 32; @@ -1021,7 +1022,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, { struct fuse_inode_handle parent; - if (fh_type != 0x82 || fh_len < 6) + if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) return NULL; parent.nodeid = (u64) fid->raw[3] << 32; diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index d4deb2b19959..82f5b09c04e6 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -11,9 +11,9 @@ #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) -extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); -extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, - struct posix_acl *acl, int type); +struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); +int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 6b060fc9e260..9611bfceda4b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -155,7 +155,7 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + if (gfs2_assert_withdraw(sdp, ip->i_gl->gl_state == LM_ST_EXCLUSIVE)) goto out; if (folio_test_checked(folio) || current->journal_info) goto out_ignore; @@ -214,12 +214,12 @@ static int gfs2_write_jdata_batch(struct address_space *mapping, unsigned nrblocks; int i; int ret; - int nr_pages = 0; + size_t size = 0; int nr_folios = folio_batch_count(fbatch); for (i = 0; i < nr_folios; i++) - nr_pages += folio_nr_pages(fbatch->folios[i]); - nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); + size += folio_size(fbatch->folios[i]); + nrblocks = size >> inode->i_blkbits; ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) @@ -403,27 +403,27 @@ static int gfs2_jdata_writepages(struct address_space *mapping, } /** - * stuffed_readpage - Fill in a Linux page with stuffed file data + * stuffed_readpage - Fill in a Linux folio with stuffed file data * @ip: the inode - * @page: the page + * @folio: the folio * * Returns: errno */ -static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *dibh; - u64 dsize = i_size_read(&ip->i_inode); - void *kaddr; + size_t i_size = i_size_read(&ip->i_inode); + void *data; int error; /* * Due to the order of unstuffing files and ->fault(), we can be - * asked for a zero page in the case of a stuffed file being extended, + * asked for a zero folio in the case of a stuffed file being extended, * so we need to supply one here. It doesn't happen often. */ - if (unlikely(page->index)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (unlikely(folio->index)) { + folio_zero_range(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); return 0; } @@ -431,13 +431,11 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) if (error) return error; - kaddr = kmap_local_page(page); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); - memset(kaddr + dsize, 0, PAGE_SIZE - dsize); - kunmap_local(kaddr); - flush_dcache_page(page); + data = dibh->b_data + sizeof(struct gfs2_dinode); + memcpy_to_folio(folio, 0, data, i_size); + folio_zero_range(folio, i_size, folio_size(folio) - i_size); brelse(dibh); - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } @@ -458,7 +456,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { error = iomap_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { - error = stuffed_readpage(ip, &folio->page); + error = stuffed_readpage(ip, folio); folio_unlock(folio); } else { error = mpage_read_folio(folio, gfs2_block_map); @@ -479,31 +477,29 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) * */ -int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, - unsigned size) +ssize_t gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, + size_t size) { struct address_space *mapping = ip->i_inode.i_mapping; unsigned long index = *pos >> PAGE_SHIFT; - unsigned offset = *pos & (PAGE_SIZE - 1); - unsigned copied = 0; - unsigned amt; - struct page *page; + size_t copied = 0; do { - page = read_cache_page(mapping, index, gfs2_read_folio, NULL); - if (IS_ERR(page)) { - if (PTR_ERR(page) == -EINTR) + size_t offset, chunk; + struct folio *folio; + + folio = read_cache_folio(mapping, index, gfs2_read_folio, NULL); + if (IS_ERR(folio)) { + if (PTR_ERR(folio) == -EINTR) continue; - return PTR_ERR(page); + return PTR_ERR(folio); } - amt = size - copied; - if (offset + size > PAGE_SIZE) - amt = PAGE_SIZE - offset; - memcpy_from_page(buf + copied, page, offset, amt); - put_page(page); - copied += amt; - index++; - offset = 0; + offset = *pos + copied - folio_pos(folio); + chunk = min(size - copied, folio_size(folio) - offset); + memcpy_from_folio(buf + copied, folio, offset, chunk); + index = folio_next_index(folio); + folio_put(folio); + copied += chunk; } while(copied < size); (*pos) += size; return size; diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h index f08322ef41cf..a10c4334d248 100644 --- a/fs/gfs2/aops.h +++ b/fs/gfs2/aops.h @@ -8,8 +8,8 @@ #include "incore.h" -extern void adjust_fs_space(struct inode *inode); -extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - size_t from, size_t len); +void adjust_fs_space(struct inode *inode); +void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + size_t from, size_t len); #endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6eb6f1bd9e34..d9ccfd27e4f1 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -104,7 +104,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio) and write it out to disk */ unsigned int n = 1; - error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) goto out_brelse; if (isdir) { @@ -315,6 +315,12 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) } } +static inline struct buffer_head * +metapath_dibh(struct metapath *mp) +{ + return mp->mp_bh[0]; +} + static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, unsigned int x, unsigned int h) { @@ -413,13 +419,12 @@ static void release_metapath(struct metapath *mp) * gfs2_extent_length - Returns length of an extent of blocks * @bh: The metadata block * @ptr: Current position in @bh - * @limit: Max extent length to return * @eob: Set to 1 if we hit "end of block" * * Returns: The length of the extent (minimum of one block) */ -static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob) +static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob) { const __be64 *end = (__be64 *)(bh->b_data + bh->b_size); const __be64 *first = ptr; @@ -658,7 +663,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh = mp->mp_bh[0]; + struct buffer_head *dibh = metapath_dibh(mp); u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; size_t dblks = iomap->length >> inode->i_blkbits; @@ -700,7 +705,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, i = mp->mp_aheight; do { n = blks - alloced; - ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + ret = gfs2_alloc_blocks(ip, &bn, &n, 0); if (ret) goto out; alloced += n; @@ -911,7 +916,7 @@ unstuff: goto do_alloc; bh = mp->mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh, ptr, len, &eob); + len = gfs2_extent_length(bh, ptr, &eob); iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; iomap->length = len << inode->i_blkbits; diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index e5b7d17131ed..4e8b1e8ebdf3 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -46,24 +46,24 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, extern const struct iomap_ops gfs2_iomap_ops; extern const struct iomap_writeback_ops gfs2_writeback_ops; -extern int gfs2_unstuff_dinode(struct gfs2_inode *ip); -extern int gfs2_block_map(struct inode *inode, sector_t lblock, - struct buffer_head *bh, int create); -extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, - struct iomap *iomap); -extern int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, - struct iomap *iomap); -extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, - unsigned int *extlen); -extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, - unsigned *extlen, bool *new); -extern int gfs2_setattr_size(struct inode *inode, u64 size); -extern int gfs2_truncatei_resume(struct gfs2_inode *ip); -extern int gfs2_file_dealloc(struct gfs2_inode *ip); -extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len); -extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); -extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); -extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length); +int gfs2_unstuff_dinode(struct gfs2_inode *ip); +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen); +int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned *extlen, bool *new); +int gfs2_setattr_size(struct inode *inode, u64 size); +int gfs2_truncatei_resume(struct gfs2_inode *ip); +int gfs2_file_dealloc(struct gfs2_inode *ip); +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); +int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); +void gfs2_free_journal_extents(struct gfs2_jdesc *jd); +int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 61ddd03ea111..560e4624c09f 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -868,7 +868,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_dirent *dent; struct timespec64 tv = current_time(inode); - error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &bn, &n, 0); if (error) return NULL; bh = gfs2_meta_new(ip->i_gl, bn); diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 5b76480c17c9..25a857c78b53 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -23,32 +23,32 @@ struct gfs2_diradd { int save_loc; }; -extern struct inode *gfs2_dir_search(struct inode *dir, - const struct qstr *filename, - bool fail_on_exist); -extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, - const struct gfs2_inode *ip); -extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip, struct gfs2_diradd *da); +struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename, + bool fail_on_exist); +int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip, struct gfs2_diradd *da); static inline void gfs2_dir_no_add(struct gfs2_diradd *da) { brelse(da->bh); da->bh = NULL; } -extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); -extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, - struct file_ra_state *f_ra); -extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, - const struct gfs2_inode *nip, unsigned int new_type); +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); +int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra); +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); -extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); -extern int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename, - struct gfs2_diradd *da); -extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, - struct buffer_head **bhp); -extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); +int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename, + struct gfs2_diradd *da); +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); +void gfs2_dir_hash_inval(struct gfs2_inode *ip); static inline u32 gfs2_disk_hash(const char *data, int len) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f2700477a300..4b66efc1a82a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -418,7 +418,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; u64 offset = page_offset(page); unsigned int data_blocks, ind_blocks, rblocks; vm_fault_t ret = VM_FAULT_LOCKED; @@ -1120,14 +1120,16 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out_unlock; - ret = file_update_time(file); - if (ret) - goto out_unlock; - if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; ssize_t buffered, ret2; + /* + * Note that under direct I/O, we don't allow and inode + * timestamp updates, so we're not calling file_update_time() + * here. + */ + ret = gfs2_file_direct_write(iocb, from, &gh); if (ret < 0 || !iov_iter_count(from)) goto out_unlock; @@ -1154,6 +1156,10 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!ret || ret2 > 0) ret += ret2; } else { + ret = file_update_time(file); + if (ret) + goto out_unlock; + ret = gfs2_file_buffered_write(iocb, from, &gh); if (likely(ret > 0)) ret = generic_write_sync(iocb, ret); @@ -1245,7 +1251,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes, max_blks; int error; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d5fa75eac0bf..d6bf1f8c25dc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1524,7 +1524,6 @@ fail: return; } list_add_tail(&gh->gh_list, insert_pt); - gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); spin_unlock(&gl->gl_lockref.lock); if (sdp->sd_lockstruct.ls_ops->lm_cancel) sdp->sd_lockstruct.ls_ops->lm_cancel(gl); diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index c8685ca7d2a2..61197598abfd 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -156,21 +156,6 @@ out: return gh; } -static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) -{ - return gl->gl_state == LM_ST_EXCLUSIVE; -} - -static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) -{ - return gl->gl_state == LM_ST_DEFERRED; -} - -static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) -{ - return gl->gl_state == LM_ST_SHARED; -} - static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) { if (gl->gl_ops->go_flags & GLOF_ASPACE) { @@ -181,40 +166,40 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) return NULL; } -extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - int create, struct gfs2_glock **glp); -extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl); -extern void gfs2_glock_put(struct gfs2_glock *gl); -extern void gfs2_glock_queue_put(struct gfs2_glock *gl); +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl); +void gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_glock_queue_put(struct gfs2_glock *gl); -extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, - u16 flags, struct gfs2_holder *gh, - unsigned long ip); +void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + u16 flags, struct gfs2_holder *gh, + unsigned long ip); static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh) { __gfs2_holder_init(gl, state, flags, gh, _RET_IP_); } -extern void gfs2_holder_reinit(unsigned int state, u16 flags, - struct gfs2_holder *gh); -extern void gfs2_holder_uninit(struct gfs2_holder *gh); -extern int gfs2_glock_nq(struct gfs2_holder *gh); -extern int gfs2_glock_poll(struct gfs2_holder *gh); -extern int gfs2_instantiate(struct gfs2_holder *gh); -extern int gfs2_glock_holder_ready(struct gfs2_holder *gh); -extern int gfs2_glock_wait(struct gfs2_holder *gh); -extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_glock_dq(struct gfs2_holder *gh); -extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); -extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); -extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - unsigned int state, u16 flags, - struct gfs2_holder *gh); -extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, +void gfs2_holder_reinit(unsigned int state, u16 flags, + struct gfs2_holder *gh); +void gfs2_holder_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq(struct gfs2_holder *gh); +int gfs2_glock_poll(struct gfs2_holder *gh); +int gfs2_instantiate(struct gfs2_holder *gh); +int gfs2_glock_holder_ready(struct gfs2_holder *gh); +int gfs2_glock_wait(struct gfs2_holder *gh); +int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq(struct gfs2_holder *gh); +void gfs2_glock_dq_wait(struct gfs2_holder *gh); +void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, u16 flags, + struct gfs2_holder *gh); +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \ gfs2_dump_glock(NULL, gl, true); \ @@ -228,7 +213,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \ while (0) -extern __printf(2, 3) +__printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** @@ -256,27 +241,27 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, return error; } -extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); -extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); -extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); -extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); -extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); -extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); -extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); -extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); -extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); -extern void gfs2_glock_free(struct gfs2_glock *gl); +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); +void gfs2_cancel_delete_work(struct gfs2_glock *gl); +void gfs2_flush_delete_work(struct gfs2_sbd *sdp); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); +void gfs2_glock_thaw(struct gfs2_sbd *sdp); +void gfs2_glock_add_to_lru(struct gfs2_glock *gl); +void gfs2_glock_free(struct gfs2_glock *gl); -extern int __init gfs2_glock_init(void); -extern void gfs2_glock_exit(void); +int __init gfs2_glock_init(void); +void gfs2_glock_exit(void); -extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp); -extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); -extern void gfs2_register_debugfs(void); -extern void gfs2_unregister_debugfs(void); +void gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +void gfs2_register_debugfs(void); +void gfs2_unregister_debugfs(void); -extern void glock_set_object(struct gfs2_glock *gl, void *object); -extern void glock_clear_object(struct gfs2_glock *gl, void *object); +void glock_set_object(struct gfs2_glock *gl, void *object); +void glock_clear_object(struct gfs2_glock *gl, void *object); extern const struct lm_lockops gfs2_dlm_ops; @@ -295,7 +280,7 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh) return !list_empty(&gh->gh_list); } -extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); -extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); +void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); +bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index e7d334c277a1..b41c78bd2cc0 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -615,18 +615,6 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl) } /** - * freeze_go_demote_ok - * @gl: the glock - * - * Always returns 0 - */ - -static int freeze_go_demote_ok(const struct gfs2_glock *gl) -{ - return 0; -} - -/** * iopen_go_callback - schedule the dcache entry for the inode to be deleted * @gl: the glock * @remote: true if this came from a different cluster node @@ -745,7 +733,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { const struct gfs2_glock_operations gfs2_freeze_glops = { .go_xmote_bh = freeze_go_xmote_bh, - .go_demote_ok = freeze_go_demote_ok, .go_callback = freeze_go_callback, .go_type = LM_TYPE_NONDISK, .go_flags = GLOF_NONDISK, diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 695898afcaf1..9341423798df 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -22,7 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; -extern int gfs2_inode_metasync(struct gfs2_glock *gl); -extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); +int gfs2_inode_metasync(struct gfs2_glock *gl); +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a8c95c5293c6..95a334d64da2 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -863,7 +863,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) preempt_enable(); } -extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); +struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip) { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7fe77bc771e5..1b95db2c3aac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -266,17 +266,18 @@ fail_iput: } -struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +/** + * gfs2_lookup_meta - Look up an inode in a metadata directory + * @dip: The directory + * @name: The name of the inode + */ +struct inode *gfs2_lookup_meta(struct inode *dip, const char *name) { struct qstr qstr; struct inode *inode; + gfs2_str2qstr(&qstr, name); inode = gfs2_lookupi(dip, &qstr, 1); - /* gfs2_lookupi has inconsistent callers: vfs - * related routines expect NULL for no entry found, - * gfs2_lookup_simple callers expect ENOENT - * and do not check for NULL. - */ if (IS_ERR_OR_NULL(inode)) return inode ? inode : ERR_PTR(-ENOENT); @@ -418,7 +419,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) if (error) goto out_ipreserv; - error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1); if (error) goto out_trans_end; @@ -1867,16 +1868,24 @@ out: int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { + int may_not_block = mask & MAY_NOT_BLOCK; struct gfs2_inode *ip; struct gfs2_holder i_gh; + struct gfs2_glock *gl; int error; gfs2_holder_mark_uninitialized(&i_gh); ip = GFS2_I(inode); - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (mask & MAY_NOT_BLOCK) + gl = rcu_dereference_check(ip->i_gl, !may_not_block); + if (unlikely(!gl)) { + /* inode is getting torn down, must be RCU mode */ + WARN_ON_ONCE(!may_not_block); + return -ECHILD; + } + if (gfs2_glock_is_locked_by_me(gl) == NULL) { + if (may_not_block) return -ECHILD; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; } @@ -1921,7 +1930,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) kuid_t ouid, nuid; kgid_t ogid, ngid; int error; - struct gfs2_alloc_parms ap; + struct gfs2_alloc_parms ap = {}; ouid = inode->i_uid; ogid = inode->i_gid; @@ -2154,7 +2163,7 @@ static int gfs2_update_time(struct inode *inode, int flags) int error; gh = gfs2_glock_is_locked_by_me(gl); - if (gh && !gfs2_glock_is_held_excl(gl)) { + if (gh && gl->gl_state != LM_ST_EXCLUSIVE) { gfs2_glock_dq(gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh); error = gfs2_glock_nq(gh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c8c5814e7295..fd15d1c6b6fb 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -13,9 +13,9 @@ #include "util.h" bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask); -extern int gfs2_internal_read(struct gfs2_inode *ip, - char *buf, loff_t *pos, unsigned size); -extern void gfs2_set_aops(struct inode *inode); +ssize_t gfs2_internal_read(struct gfs2_inode *ip, + char *buf, loff_t *pos, size_t size); +void gfs2_set_aops(struct inode *inode); static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { @@ -44,19 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip) static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) { - inode->i_blocks = blocks << - (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); + inode->i_blocks = blocks << (inode->i_blkbits - 9); } static inline u64 gfs2_get_inode_blocks(const struct inode *inode) { - return inode->i_blocks >> - (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); + return inode->i_blocks >> (inode->i_blkbits - 9); } static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) { - change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT; + change <<= inode->i_blkbits - 9; gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change)); inode->i_blocks += change; } @@ -88,33 +86,33 @@ err: return -EIO; } -extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - unsigned int blktype); -extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, - u64 no_formal_ino, - unsigned int blktype); - -extern int gfs2_inode_refresh(struct gfs2_inode *ip); - -extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root); -extern int gfs2_permission(struct mnt_idmap *idmap, - struct inode *inode, int mask); -extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); -extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); -extern int gfs2_open_common(struct inode *inode, struct file *file); -extern loff_t gfs2_seek_data(struct file *file, loff_t offset); -extern loff_t gfs2_seek_hole(struct file *file, loff_t offset); +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + unsigned int blktype); +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 no_formal_ino, + unsigned int blktype); + +int gfs2_inode_refresh(struct gfs2_inode *ip); + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +int gfs2_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask); +struct inode *gfs2_lookup_meta(struct inode *dip, const char *name); +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +int gfs2_open_common(struct inode *inode, struct file *file); +loff_t gfs2_seek_data(struct file *file, loff_t offset); +loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; -extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int gfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); -extern void gfs2_set_inode_flags(struct inode *inode); - +int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int gfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa); +void gfs2_set_inode_flags(struct inode *inode); + #ifdef CONFIG_GFS2_FS_LOCKING_DLM extern const struct file_operations gfs2_file_fops; extern const struct file_operations gfs2_dir_fops; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 653cffcbf869..c27b05099c1e 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -70,29 +70,29 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) } } -extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); -extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); -extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); -extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp); -extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); -extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); -extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, - unsigned int *extra_revokes); -extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, - unsigned int *extra_revokes); -extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, - u64 seq, u32 tail, u32 lblock, u32 flags, - blk_opf_t op_flags); -extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, - u32 type); -extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); -extern void log_flush_wait(struct gfs2_sbd *sdp); +void gfs2_ordered_del_inode(struct gfs2_inode *ip); +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); +void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +bool gfs2_log_is_empty(struct gfs2_sbd *sdp); +void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); +bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + u64 seq, u32 tail, u32 lblock, u32 flags, + blk_opf_t op_flags); +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + u32 type); +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); +void log_flush_wait(struct gfs2_sbd *sdp); -extern int gfs2_logd(void *data); -extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); -extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); -extern void gfs2_ail_drain(struct gfs2_sbd *sdp); +int gfs2_logd(void *data); +void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +void gfs2_glock_remove_revoke(struct gfs2_glock *gl); +void gfs2_flush_revokes(struct gfs2_sbd *sdp); +void gfs2_ail_drain(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 1412ffba1d44..07890c7b145d 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -11,16 +11,18 @@ #include "incore.h" extern const struct gfs2_log_operations *gfs2_log_ops[]; -extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); -extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); -extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, - struct page *page, unsigned size, unsigned offset, - u64 blkno); -extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); -extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); -extern int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, bool keep_cache); -extern void gfs2_drain_revokes(struct gfs2_sbd *sdp); + +void gfs2_log_incr_head(struct gfs2_sbd *sdp); +u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); +void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno); +void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, bool keep_cache); +void gfs2_drain_revokes(struct gfs2_sbd *sdp); + static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { return sdp->sd_ldptrs; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index d0a58cdd433a..831d988c2ceb 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -50,21 +50,21 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) return inode->i_sb->s_fs_info; } -extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); -extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - int rahead, struct buffer_head **bhp); -extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); -extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, - int create); +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + int rahead, struct buffer_head **bhp); +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, + int create); enum { REMOVE_JDATA = 0, REMOVE_META = 1, }; -extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); -extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); -extern int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, - struct buffer_head **bhp); +void gfs2_remove_from_journal(struct buffer_head *bh, int meta); +void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, + struct buffer_head **bhp); static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, struct buffer_head **bhp) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ecf789b7168c..b108c5d26839 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -292,8 +292,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) return error; } - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - - GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9; sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) / sizeof(u64); @@ -648,7 +647,7 @@ static int init_statfs(struct gfs2_sbd *sdp) struct gfs2_jdesc *jd; struct gfs2_inode *ip; - sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + sdp->sd_statfs_inode = gfs2_lookup_meta(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); @@ -657,7 +656,7 @@ static int init_statfs(struct gfs2_sbd *sdp) if (sdp->sd_args.ar_spectator) goto out; - pn = gfs2_lookup_simple(master, "per_node"); + pn = gfs2_lookup_meta(master, "per_node"); if (IS_ERR(pn)) { error = PTR_ERR(pn); fs_err(sdp, "can't find per_node directory: %d\n", error); @@ -674,7 +673,7 @@ static int init_statfs(struct gfs2_sbd *sdp) goto free_local; } sprintf(buf, "statfs_change%u", jd->jd_jid); - lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + lsi->si_sc_inode = gfs2_lookup_meta(pn, buf); if (IS_ERR(lsi->si_sc_inode)) { error = PTR_ERR(lsi->si_sc_inode); fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", @@ -739,7 +738,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_statfs; - sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); + sdp->sd_jindex = gfs2_lookup_meta(master, "jindex"); if (IS_ERR(sdp->sd_jindex)) { fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); @@ -888,7 +887,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail; /* Read in the resource index inode */ - sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); + sdp->sd_rindex = gfs2_lookup_meta(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); @@ -897,7 +896,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ - sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); + sdp->sd_quota_inode = gfs2_lookup_meta(master, "quota"); if (IS_ERR(sdp->sd_quota_inode)) { error = PTR_ERR(sdp->sd_quota_inode); fs_err(sdp, "can't get quota file inode: %d\n", error); @@ -941,7 +940,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_qc_gh; - pn = gfs2_lookup_simple(master, "per_node"); + pn = gfs2_lookup_meta(master, "per_node"); if (IS_ERR(pn)) { error = PTR_ERR(pn); fs_err(sdp, "can't find per_node directory: %d\n", error); @@ -949,7 +948,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) } sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); - sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + sdp->sd_qc_inode = gfs2_lookup_meta(pn, buf); if (IS_ERR(sdp->sd_qc_inode)) { error = PTR_ERR(sdp->sd_qc_inode); fs_err(sdp, "can't find local \"qc\" file: %d\n", error); @@ -1187,10 +1186,9 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) /* Set up the buffer cache and fill in some fake block size values to allow us to read-in the on-disk superblock. */ - sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512); sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - - GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9; sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; @@ -1278,10 +1276,8 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) if (!sb_rdonly(sb)) { error = init_threads(sdp); - if (error) { - gfs2_withdraw_delayed(sdp); + if (error) goto fail_per_node; - } } error = gfs2_freeze_lock_shared(sdp); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 5cbbc1a46a92..95dae7838b4e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -470,6 +470,17 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, (sync_gen && (qd->qd_sync_gen >= *sync_gen))) return 0; + /* + * If qd_change is 0 it means a pending quota change was negated. + * We should not sync it, but we still have a qd reference and slot + * reference taken by gfs2_quota_change -> do_qc that need to be put. + */ + if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) { + slot_put(qd); + qd_put(qd); + return 0; + } + if (!lockref_get_not_dead(&qd->qd_lockref)) return 0; @@ -912,7 +923,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { struct gfs2_sbd *sdp = (*qda)->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; unsigned int data_blocks, ind_blocks; struct gfs2_holder *ghs, i_gh; unsigned int qx, x; @@ -1086,8 +1097,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) u32 x; int error; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && - sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; error = gfs2_quota_hold(ip, uid, gid); @@ -1194,17 +1204,16 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) #define MAX_LINE 256 -static int print_message(struct gfs2_quota_data *qd, char *type) +static void print_message(struct gfs2_quota_data *qd, char *type) { struct gfs2_sbd *sdp = qd->qd_sbd; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) { fs_info(sdp, "quota %s for %s %u\n", type, (qd->qd_id.type == USRQUOTA) ? "user" : "group", from_kqid(&init_user_ns, qd->qd_id)); - - return 0; + } } /** @@ -1274,7 +1283,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, * HZ)) { quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); - error = print_message(qd, "warning"); + print_message(qd, "warning"); + error = 0; qd->qd_last_warn = jiffies; } } @@ -1288,8 +1298,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 x; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - if ((sdp->sd_args.ar_quota != GFS2_QUOTA_ON && - sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) || + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF || gfs2_assert_warn(sdp, change)) return; if (ip->i_diskflags & GFS2_DIF_SYSTEM) @@ -1746,7 +1755,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 36f54b426b0c..f462d9cb3087 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -15,27 +15,27 @@ struct gfs2_sbd; #define NO_UID_QUOTA_CHANGE INVALID_UID #define NO_GID_QUOTA_CHANGE INVALID_GID -extern int gfs2_qa_get(struct gfs2_inode *ip); -extern void gfs2_qa_put(struct gfs2_inode *ip); -extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); -extern void gfs2_quota_unhold(struct gfs2_inode *ip); +int gfs2_qa_get(struct gfs2_inode *ip); +void gfs2_qa_put(struct gfs2_inode *ip); +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +void gfs2_quota_unhold(struct gfs2_inode *ip); -extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); -extern void gfs2_quota_unlock(struct gfs2_inode *ip); +int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +void gfs2_quota_unlock(struct gfs2_inode *ip); -extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, - struct gfs2_alloc_parms *ap); -extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - kuid_t uid, kgid_t gid); +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap); +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + kuid_t uid, kgid_t gid); -extern int gfs2_quota_sync(struct super_block *sb, int type); -extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); +int gfs2_quota_sync(struct super_block *sb, int type); +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); -extern int gfs2_quota_init(struct gfs2_sbd *sdp); -extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); -extern int gfs2_quotad(void *data); +int gfs2_quota_init(struct gfs2_sbd *sdp); +void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +int gfs2_quotad(void *data); -extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); +void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) @@ -50,8 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && - sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) + if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) return 0; ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); if (ret) @@ -63,6 +62,7 @@ extern const struct quotactl_ops gfs2_quotactl_ops; int __init gfs2_qd_shrinker_init(void); void gfs2_qd_shrinker_exit(void); extern struct list_lru gfs2_qd_lru; -extern void __init gfs2_quota_hash_init(void); + +void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 7a0c9d0b7503..6a0fd42e1120 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -17,18 +17,18 @@ static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk) *blk = 0; } -extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh); -extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); -extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); -extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); +int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +void gfs2_revoke_clean(struct gfs2_jdesc *jd); -extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); -extern void gfs2_recover_func(struct work_struct *work); -extern int __get_log_header(struct gfs2_sbd *sdp, - const struct gfs2_log_header *lh, unsigned int blkno, - struct gfs2_log_header_host *head); +int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +void gfs2_recover_func(struct work_struct *work); +int __get_log_header(struct gfs2_sbd *sdp, + const struct gfs2_log_header *lh, unsigned int blkno, + struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9308190895c8..c2060203b98a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2411,13 +2411,12 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, * @bn: Used to return the starting block number * @nblocks: requested number of blocks/extent length (value/result) * @dinode: 1 if we're allocating a dinode block, else 0 - * @generation: the generation number of the inode * * Returns: 0 or error */ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, - bool dinode, u64 *generation) + bool dinode) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; @@ -2477,10 +2476,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, rbm.rgd->rd_free -= *nblocks; spin_unlock(&rbm.rgd->rd_rsspin); if (dinode) { + u64 generation; + rbm.rgd->rd_dinodes++; - *generation = rbm.rgd->rd_igeneration++; - if (*generation == 0) - *generation = rbm.rgd->rd_igeneration++; + generation = rbm.rgd->rd_igeneration++; + if (generation == 0) + generation = rbm.rgd->rd_igeneration++; + ip->i_generation = generation; } gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 00b30cf893af..8d20e99385db 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -22,38 +22,38 @@ struct gfs2_rgrpd; struct gfs2_sbd; struct gfs2_holder; -extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); -extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); -extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); -extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -extern int gfs2_rindex_update(struct gfs2_sbd *sdp); -extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); -extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl); -extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +int gfs2_rindex_update(struct gfs2_sbd *sdp); +void gfs2_free_clones(struct gfs2_rgrpd *rgd); +int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl); +void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); -extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); #define GFS2_AF_ORLOV 1 -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, - struct gfs2_alloc_parms *ap); -extern void gfs2_inplace_release(struct gfs2_inode *ip); - -extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, - bool dinode, u64 *generation); - -extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip); -extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, - u64 bstart, u32 blen, int meta); -extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, - u64 bstart, u32 blen); -extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); -extern void gfs2_unlink_di(struct inode *inode); -extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, - unsigned int type); +int gfs2_inplace_reserve(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap); +void gfs2_inplace_release(struct gfs2_inode *ip); + +int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, + bool dinode); + +void gfs2_rs_deltree(struct gfs2_blkreserv *rs); +void gfs2_rs_delete(struct gfs2_inode *ip); +void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen, int meta); +void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen); +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +void gfs2_unlink_di(struct inode *inode); +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); struct gfs2_rgrp_list { unsigned int rl_rgrps; @@ -62,18 +62,19 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, - u64 block); -extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, - unsigned int state, u16 flags); -extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); -extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, - const char *fs_id_buf); -extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, - struct buffer_head *bh, - const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); -extern int gfs2_fitrim(struct file *filp, void __user *argp); +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, + u64 block); +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, + unsigned int state, u16 flags); +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +u64 gfs2_ri_total(struct gfs2_sbd *sdp); +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, + const char *fs_id_buf); +int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, + u64 *ptrimmed); +int gfs2_fitrim(struct file *filp, void __user *argp); /* This is how to tell if a reservation is in the rgrp tree: */ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) @@ -88,9 +89,9 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) return first <= block && block < last; } -extern void check_and_update_goal(struct gfs2_inode *ip); +void check_and_update_goal(struct gfs2_inode *ip); -extern void rgrp_lock_local(struct gfs2_rgrpd *rgd); -extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd); +void rgrp_lock_local(struct gfs2_rgrpd *rgd); +void rgrp_unlock_local(struct gfs2_rgrpd *rgd); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 52a878fa7139..d21c04a22d73 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -602,13 +602,15 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - if (!sb_rdonly(sb)) { + if (!sb_rdonly(sb)) gfs2_make_fs_ro(sdp); - } - if (gfs2_withdrawn(sdp)) { - gfs2_destroy_threads(sdp); + else { + if (gfs2_withdrawn(sdp)) + gfs2_destroy_threads(sdp); + gfs2_quota_cleanup(sdp); } + WARN_ON(gfs2_withdrawing(sdp)); /* At this point, we're through modifying the disk */ @@ -1006,6 +1008,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sc.sc_dinodes + sc.sc_free; buf->f_ffree = sc.sc_free; buf->f_namelen = GFS2_FNAMESIZE; + buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } @@ -1299,18 +1302,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) * As a last resort, if another node keeps holding the iopen glock * without showing any activity on the inode glock, we will eventually * time out and fail the iopen glock upgrade. - * - * Note that we're passing the LM_FLAG_TRY_1CB flag to the first - * locking request as an optimization to notify lock holders as soon as - * possible. Without that flag, they'd be notified implicitly by the - * second locking request. */ - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh); - error = gfs2_glock_nq(gh); - if (error != GLR_TRYFAILED) - return !error; - gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh); error = gfs2_glock_nq(gh); if (error) @@ -1550,7 +1543,7 @@ out: wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put_eventually(ip->i_gl); - ip->i_gl = NULL; + rcu_assign_pointer(ip->i_gl, NULL); } } @@ -1576,7 +1569,7 @@ static void gfs2_free_inode(struct inode *inode) kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); } -extern void free_local_statfs_inodes(struct gfs2_sbd *sdp) +void free_local_statfs_inodes(struct gfs2_sbd *sdp) { struct local_statfs_inode *lsi, *safe; @@ -1591,8 +1584,8 @@ extern void free_local_statfs_inodes(struct gfs2_sbd *sdp) } } -extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, - unsigned int index) +struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index) { struct local_statfs_inode *lsi; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index b4ddf6244586..b27a774d9580 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -15,7 +15,7 @@ #define GFS2_FS_FORMAT_MIN (1801) #define GFS2_FS_FORMAT_MAX (1802) -extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); +void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) { @@ -26,33 +26,33 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -extern void gfs2_jindex_free(struct gfs2_sbd *sdp); +void gfs2_jindex_free(struct gfs2_sbd *sdp); -extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); -extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); -extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, - struct gfs2_inode **ipp); +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +int gfs2_jdesc_check(struct gfs2_jdesc *jd); +int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); -extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp); -extern void gfs2_online_uevent(struct gfs2_sbd *sdp); -extern void gfs2_destroy_threads(struct gfs2_sbd *sdp); -extern int gfs2_statfs_init(struct gfs2_sbd *sdp); -extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, - s64 dinodes); -extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, - const void *buf); -extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, - void *buf); -extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh); -extern int gfs2_statfs_sync(struct super_block *sb, int type); -extern void gfs2_freeze_func(struct work_struct *work); -extern void gfs2_thaw_freeze_initiator(struct super_block *sb); +int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +void gfs2_make_fs_ro(struct gfs2_sbd *sdp); +void gfs2_online_uevent(struct gfs2_sbd *sdp); +void gfs2_destroy_threads(struct gfs2_sbd *sdp); +int gfs2_statfs_init(struct gfs2_sbd *sdp); +void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh); +int gfs2_statfs_sync(struct super_block *sb, int type); +void gfs2_freeze_func(struct work_struct *work); +void gfs2_thaw_freeze_initiator(struct super_block *sb); -extern void free_local_statfs_inodes(struct gfs2_sbd *sdp); -extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, - unsigned int index); -extern void free_sbd(struct gfs2_sbd *sdp); +void free_local_statfs_inodes(struct gfs2_sbd *sdp); +struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index); +void free_sbd(struct gfs2_sbd *sdp); extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index c76ad9a4c75a..f8ce5302280d 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -34,17 +34,17 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned return rgd->rd_length; } -extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, - unsigned int blocks, unsigned int revokes, - unsigned long ip); -extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes); - -extern void gfs2_trans_end(struct gfs2_sbd *sdp); -extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); -extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); -extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); -extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr); +int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip); +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); + +void gfs2_trans_end(struct gfs2_sbd *sdp); +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); +void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index cdb839529175..11c9d59b6889 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -147,10 +147,10 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, unsigned int line); -extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, - bool verbose); -extern int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp); -extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh); +int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + bool verbose); +int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp); +void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh); #define gfs2_io_error(sdp) \ gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 79d5c5559512..8c96ba6230d1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -639,7 +639,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) u64 block; int error; - error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, block, 1); @@ -701,7 +701,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; - error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, block, 1); @@ -1002,7 +1002,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, } else { u64 blk; unsigned int n = 1; - error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &blk, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, blk, 1); diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index 2aed9d7d483d..eb12eb7e37c1 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -50,14 +50,14 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -extern int __gfs2_xattr_set(struct inode *inode, const char *name, - const void *value, size_t size, - int flags, int type); -extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern int gfs2_ea_dealloc(struct gfs2_inode *ip); +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ -extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); +int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); #endif /* __EATTR_DOT_H__ */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 54b3d489b6a7..f757d4f7ad98 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1179,7 +1179,9 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); struct hstate *h = hstate_inode(d_inode(dentry)); + u64 id = huge_encode_dev(dentry->d_sb->s_dev); + buf->f_fsid = u64_to_fsid(id); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); if (sbinfo) { diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 7ea37f49f1e1..f99591a634b4 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -150,6 +150,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child) } static const struct export_operations jffs2_export_ops = { + .encode_fh = generic_encode_ino32_fh, .get_parent = jffs2_get_parent, .fh_to_dentry = jffs2_fh_to_dentry, .fh_to_parent = jffs2_fh_to_parent, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 966826c394ee..8d8e556bd610 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -896,6 +896,7 @@ static const struct super_operations jfs_super_operations = { }; static const struct export_operations jfs_export_operations = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = jfs_fh_to_dentry, .fh_to_parent = jfs_fh_to_parent, .get_parent = jfs_get_parent, diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index aaa76410e550..f0cb729e9a97 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -854,6 +854,33 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) return ret; } +static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + loff_t ret; + + /* + * @of->mutex nests outside active ref and is primarily to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + return -ENODEV; + } + + ops = kernfs_ops(of->kn); + if (ops->llseek) + ret = ops->llseek(of, offset, whence); + else + ret = generic_file_llseek(file, offset, whence); + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + return ret; +} + static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; @@ -956,7 +983,7 @@ EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, - .llseek = generic_file_llseek, + .llseek = kernfs_fop_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, diff --git a/fs/libfs.c b/fs/libfs.c index abe2b5a40ba1..e9440d55073c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -41,6 +41,9 @@ EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { + u64 id = huge_encode_dev(dentry->d_sb->s_dev); + + buf->f_fsid = u64_to_fsid(id); buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; @@ -1310,6 +1313,47 @@ ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** + * generic_encode_ino32_fh - generic export_operations->encode_fh function + * @inode: the object to encode + * @fh: where to store the file handle fragment + * @max_len: maximum length to store there (in 4 byte units) + * @parent: parent directory inode, if wanted + * + * This generic encode_fh function assumes that the 32 inode number + * is suitable for locating an inode, and that the generation number + * can be used to check that it is still valid. It places them in the + * filehandle fragment where export_decode_fh expects to find them. + */ +int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, + struct inode *parent) +{ + struct fid *fid = (void *)fh; + int len = *max_len; + int type = FILEID_INO32_GEN; + + if (parent && (len < 4)) { + *max_len = 4; + return FILEID_INVALID; + } else if (len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + len = 2; + fid->i32.ino = inode->i_ino; + fid->i32.gen = inode->i_generation; + if (parent) { + fid->i32.parent_ino = parent->i_ino; + fid->i32.parent_gen = parent->i_generation; + len = 4; + type = FILEID_INO32_GEN_PARENT; + } + *max_len = len; + return type; +} +EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); + +/** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b7da17e53007..7b641095a665 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -426,8 +426,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid) return -EINVAL; } - if (!inode->i_sb->s_export_op || - !inode->i_sb->s_export_op->fh_to_dentry) { + if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) { dprintk("exp_export: export of invalid fs type.\n"); return -EINVAL; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 45aecdc302f4..4d765c72496f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1595,7 +1595,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) * file handles so user can use name_to_handle_at() to compare fids * reported with events to the file handle of watched objects. */ - if (!nop) + if (!exportfs_can_encode_fid(nop)) return -EOPNOTSUPP; /* @@ -1603,7 +1603,7 @@ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) * supports decoding file handles, so user has a way to map back the * reported fids to filesystem objects. */ - if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry) + if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop)) return -EOPNOTSUPP; return 0; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index ab44f2db533b..d7498ddc4a72 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -384,6 +384,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, * and due to using iget() whereas NTFS needs ntfs_iget(). */ const struct export_operations ntfs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .get_parent = ntfs_get_parent, /* Find the parent of a given directory. */ .fh_to_dentry = ntfs_fh_to_dentry, diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index f763e3256ccc..9153dffde950 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -811,6 +811,7 @@ static int ntfs_nfs_commit_metadata(struct inode *inode) } static const struct export_operations ntfs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ntfs_fh_to_dentry, .fh_to_parent = ntfs_fh_to_parent, .get_parent = ntfs3_get_parent, diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 4e173d56b11f..5648954f8588 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_OVERLAY_FS) += overlay.o overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \ - copy_up.o export.o params.o + copy_up.o export.o params.o xattrs.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index ada3fcc9c6d5..4382881b0709 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -252,7 +252,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, return PTR_ERR(old_file); /* Try to use clone_file_range to clone up within the same fs */ + ovl_start_write(dentry); cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); + ovl_end_write(dentry); if (cloned == len) goto out_fput; /* Couldn't clone, so now we try to copy the data */ @@ -287,8 +289,12 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, * it may not recognize all kind of holes and sometimes * only skips partial of hole area. However, it will be * enough for most of the use cases. + * + * We do not hold upper sb_writers throughout the loop to avert + * lockdep warning with llseek of lower file in nested overlay: + * - upper sb_writers + * -- lower ovl_inode_lock (ovl_llseek) */ - if (skip_hole && data_pos < old_pos) { data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA); if (data_pos > old_pos) { @@ -303,9 +309,11 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, } } + ovl_start_write(dentry); bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); + ovl_end_write(dentry); if (bytes <= 0) { error = bytes; break; @@ -426,29 +434,29 @@ out_err: return ERR_PTR(err); } -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, - struct dentry *upper) +struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) { - const struct ovl_fh *fh = NULL; - int err; - /* * When lower layer doesn't support export operations store a 'null' fh, * so we can use the overlay.origin xattr to distignuish between a copy * up and a pure upper inode. */ - if (ovl_can_decode_fh(lower->d_sb)) { - fh = ovl_encode_real_fh(ofs, lower, false); - if (IS_ERR(fh)) - return PTR_ERR(fh); - } + if (!ovl_can_decode_fh(origin->d_sb)) + return NULL; + + return ovl_encode_real_fh(ofs, origin, false); +} + +int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, + struct dentry *upper) +{ + int err; /* * Do not fail when upper doesn't support xattrs. */ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf, fh ? fh->fb.len : 0, 0); - kfree(fh); /* Ignore -EPERM from setting "user.*" on symlink/special */ return err == -EPERM ? 0 : err; @@ -476,7 +484,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, * * Caller must hold i_mutex on indexdir. */ -static int ovl_create_index(struct dentry *dentry, struct dentry *origin, +static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, struct dentry *upper) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); @@ -502,7 +510,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry)))) return -EIO; - err = ovl_get_index_name(ofs, origin, &name); + err = ovl_get_index_name_fh(fh, &name); if (err) return err; @@ -541,6 +549,7 @@ struct ovl_copy_up_ctx { struct dentry *destdir; struct qstr destname; struct dentry *workdir; + const struct ovl_fh *origin_fh; bool origin; bool indexed; bool metacopy; @@ -555,14 +564,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(upperdir); + ovl_start_write(c->dentry); + /* Mark parent "impure" because it may now contain non-pure upper */ err = ovl_set_impure(c->parent, upperdir); if (err) - return err; + goto out; err = ovl_set_nlink_lower(c->dentry); if (err) - return err; + goto out; inode_lock_nested(udir, I_MUTEX_PARENT); upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, @@ -581,10 +592,12 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) } inode_unlock(udir); if (err) - return err; + goto out; err = ovl_set_nlink_upper(c->dentry); +out: + ovl_end_write(c->dentry); return err; } @@ -637,7 +650,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) * hard link. */ if (c->origin) { - err = ovl_set_origin(ofs, c->lowerpath.dentry, temp); + err = ovl_set_origin_fh(ofs, c->origin_fh, temp); if (err) return err; } @@ -719,21 +732,19 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) .link = c->link }; - /* workdir and destdir could be the same when copying up to indexdir */ - err = -EIO; - if (lock_rename(c->workdir, c->destdir) != NULL) - goto unlock; - err = ovl_prep_cu_creds(c->dentry, &cc); if (err) - goto unlock; + return err; + ovl_start_write(c->dentry); + inode_lock(wdir); temp = ovl_create_temp(ofs, c->workdir, &cattr); + inode_unlock(wdir); + ovl_end_write(c->dentry); ovl_revert_cu_creds(&cc); - err = PTR_ERR(temp); if (IS_ERR(temp)) - goto unlock; + return PTR_ERR(temp); /* * Copy up data first and then xattrs. Writing data after @@ -741,15 +752,28 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) */ path.dentry = temp; err = ovl_copy_up_data(c, &path); - if (err) + /* + * We cannot hold lock_rename() throughout this helper, because or + * lock ordering with sb_writers, which shouldn't be held when calling + * ovl_copy_up_data(), so lock workdir and destdir and make sure that + * temp wasn't moved before copy up completion or cleanup. + * If temp was moved, abort without the cleanup. + */ + ovl_start_write(c->dentry); + if (lock_rename(c->workdir, c->destdir) != NULL || + temp->d_parent != c->workdir) { + err = -EIO; + goto unlock; + } else if (err) { goto cleanup; + } err = ovl_copy_up_metadata(c, temp); if (err) goto cleanup; if (S_ISDIR(c->stat.mode) && c->indexed) { - err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); + err = ovl_create_index(c->dentry, c->origin_fh, temp); if (err) goto cleanup; } @@ -779,6 +803,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) ovl_set_flag(OVL_WHITEOUTS, inode); unlock: unlock_rename(c->workdir, c->destdir); + ovl_end_write(c->dentry); return err; @@ -802,9 +827,10 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; + ovl_start_write(c->dentry); tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + ovl_end_write(c->dentry); ovl_revert_cu_creds(&cc); - if (IS_ERR(tmpfile)) return PTR_ERR(tmpfile); @@ -815,9 +841,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) goto out_fput; } + ovl_start_write(c->dentry); + err = ovl_copy_up_metadata(c, temp); if (err) - goto out_fput; + goto out; inode_lock_nested(udir, I_MUTEX_PARENT); @@ -831,7 +859,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_unlock(udir); if (err) - goto out_fput; + goto out; if (c->metacopy_digest) ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); @@ -843,6 +871,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) ovl_set_upperdata(d_inode(c->dentry)); ovl_inode_update(d_inode(c->dentry), dget(temp)); +out: + ovl_end_write(c->dentry); out_fput: fput(tmpfile); return err; @@ -861,6 +891,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) { int err; struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct dentry *origin = c->lowerpath.dentry; + struct ovl_fh *fh = NULL; bool to_index = false; /* @@ -877,25 +909,35 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) to_index = true; } - if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) + if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) { + fh = ovl_get_origin_fh(ofs, origin); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + /* origin_fh may be NULL */ + c->origin_fh = fh; c->origin = true; + } if (to_index) { c->destdir = ovl_indexdir(c->dentry->d_sb); - err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname); + err = ovl_get_index_name(ofs, origin, &c->destname); if (err) - return err; + goto out_free_fh; } else if (WARN_ON(!c->parent)) { /* Disconnected dentry must be copied up to index dir */ - return -EIO; + err = -EIO; + goto out_free_fh; } else { /* * Mark parent "impure" because it may now contain non-pure * upper */ + ovl_start_write(c->dentry); err = ovl_set_impure(c->parent, c->destdir); + ovl_end_write(c->dentry); if (err) - return err; + goto out_free_fh; } /* Should we copyup with O_TMPFILE or with workdir? */ @@ -909,6 +951,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) if (c->indexed) ovl_set_flag(OVL_INDEX, d_inode(c->dentry)); + ovl_start_write(c->dentry); if (to_index) { /* Initialize nlink for copy up of disconnected dentry */ err = ovl_set_nlink_upper(c->dentry); @@ -923,10 +966,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry)); } + ovl_end_write(c->dentry); out: if (to_index) kfree(c->destname.name); +out_free_fh: + kfree(fh); return err; } @@ -1011,15 +1057,16 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * Writing to upper file will clear security.capability xattr. We * don't want that to happen for normal copy-up operation. */ + ovl_start_write(c->dentry); if (capability) { err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS, capability, cap_size, 0); - if (err) - goto out_free; } - - - err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); + if (!err) { + err = ovl_removexattr(ofs, upperpath.dentry, + OVL_XATTR_METACOPY); + } + ovl_end_write(c->dentry); if (err) goto out_free; @@ -1170,17 +1217,10 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) int ovl_maybe_copy_up(struct dentry *dentry, int flags) { - int err = 0; - - if (ovl_open_need_copy_up(dentry, flags)) { - err = ovl_want_write(dentry); - if (!err) { - err = ovl_copy_up_flags(dentry, flags); - ovl_drop_write(dentry); - } - } + if (!ovl_open_need_copy_up(dentry, flags)) + return 0; - return err; + return ovl_copy_up_flags(dentry, flags); } int ovl_copy_up_with_data(struct dentry *dentry) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 033fc0458a3d..aab3f5d93556 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -477,7 +477,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_unlock; err = -ESTALE; - if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) + if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper)) goto out_dput; newdentry = ovl_create_temp(ofs, workdir, cattr); @@ -559,10 +559,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct cred *override_cred; struct dentry *parent = dentry->d_parent; - err = ovl_copy_up(parent); - if (err) - return err; - old_cred = ovl_override_creds(dentry->d_sb); /* @@ -626,6 +622,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, .link = link, }; + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + err = ovl_want_write(dentry); if (err) goto out; @@ -700,28 +700,24 @@ static int ovl_link(struct dentry *old, struct inode *newdir, int err; struct inode *inode; - err = ovl_want_write(old); + err = ovl_copy_up(old); if (err) goto out; - err = ovl_copy_up(old); + err = ovl_copy_up(new->d_parent); if (err) - goto out_drop_write; + goto out; - err = ovl_copy_up(new->d_parent); + err = ovl_nlink_start(old); if (err) - goto out_drop_write; + goto out; if (ovl_is_metacopy_dentry(old)) { err = ovl_set_link_redirect(old); if (err) - goto out_drop_write; + goto out_nlink_end; } - err = ovl_nlink_start(old); - if (err) - goto out_drop_write; - inode = d_inode(old); ihold(inode); @@ -731,9 +727,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) iput(inode); +out_nlink_end: ovl_nlink_end(old); -out_drop_write: - ovl_drop_write(old); out: return err; } @@ -891,17 +886,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out; } - err = ovl_want_write(dentry); - if (err) - goto out; - err = ovl_copy_up(dentry->d_parent); if (err) - goto out_drop_write; + goto out; err = ovl_nlink_start(dentry); if (err) - goto out_drop_write; + goto out; old_cred = ovl_override_creds(dentry->d_sb); if (!lower_positive) @@ -926,8 +917,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (ovl_dentry_upper(dentry)) ovl_copyattr(d_inode(dentry)); -out_drop_write: - ovl_drop_write(dentry); out: ovl_cache_free(&list); return err; @@ -1131,29 +1120,32 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } } - err = ovl_want_write(old); - if (err) - goto out; - err = ovl_copy_up(old); if (err) - goto out_drop_write; + goto out; err = ovl_copy_up(new->d_parent); if (err) - goto out_drop_write; + goto out; if (!overwrite) { err = ovl_copy_up(new); if (err) - goto out_drop_write; + goto out; } else if (d_inode(new)) { err = ovl_nlink_start(new); if (err) - goto out_drop_write; + goto out; update_nlink = true; } + if (!update_nlink) { + /* ovl_nlink_start() took ovl_want_write() */ + err = ovl_want_write(old); + if (err) + goto out; + } + old_cred = ovl_override_creds(old->d_sb); if (!list_empty(&list)) { @@ -1219,7 +1211,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } } else { if (!d_is_negative(newdentry)) { - if (!new_opaque || !ovl_is_whiteout(newdentry)) + if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry)) goto out_dput; } else { if (flags & RENAME_EXCHANGE) @@ -1286,8 +1278,8 @@ out_revert_creds: revert_creds(old_cred); if (update_nlink) ovl_nlink_end(new); -out_drop_write: - ovl_drop_write(old); + else + ovl_drop_write(old); out: dput(opaquedir); ovl_cache_free(&list); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 26b782c53910..7e16bbcad95e 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -23,12 +23,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry) if (ovl_dentry_upper(dentry)) return 0; - err = ovl_want_write(dentry); - if (!err) { - err = ovl_copy_up(dentry); - ovl_drop_write(dentry); - } - + err = ovl_copy_up(dentry); if (err) { pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index ec3671ca140c..131621daeb13 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -15,10 +15,15 @@ #include <linux/fs.h> #include "overlayfs.h" +#include "../internal.h" /* for sb_init_dio_done_wq */ + struct ovl_aio_req { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; + /* used for aio completion */ + struct work_struct work; + long res; }; static struct kmem_cache *ovl_aio_request_cachep; @@ -235,6 +240,12 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) return ret; } +static void ovl_file_modified(struct file *file) +{ + /* Update size/mtime */ + ovl_copyattr(file_inode(file)); +} + static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; @@ -263,20 +274,12 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -static rwf_t ovl_iocb_to_rwf(int ifl) +#define OVL_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) { - rwf_t flags = 0; - - if (ifl & IOCB_NOWAIT) - flags |= RWF_NOWAIT; - if (ifl & IOCB_HIPRI) - flags |= RWF_HIPRI; - if (ifl & IOCB_DSYNC) - flags |= RWF_DSYNC; - if (ifl & IOCB_SYNC) - flags |= RWF_SYNC; - - return flags; + return (__force rwf_t)(flags & OVL_IOCB_MASK); } static inline void ovl_aio_put(struct ovl_aio_req *aio_req) @@ -293,10 +296,8 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) struct kiocb *orig_iocb = aio_req->orig_iocb; if (iocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(orig_iocb->ki_filp); - kiocb_end_write(iocb); - ovl_copyattr(inode); + ovl_file_modified(orig_iocb->ki_filp); } orig_iocb->ki_pos = iocb->ki_pos; @@ -313,6 +314,37 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res) orig_iocb->ki_complete(orig_iocb, res); } +static void ovl_aio_complete_work(struct work_struct *work) +{ + struct ovl_aio_req *aio_req = container_of(work, + struct ovl_aio_req, work); + + ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); +} + +static void ovl_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct ovl_aio_req *aio_req = container_of(iocb, + struct ovl_aio_req, iocb); + struct kiocb *orig_iocb = aio_req->orig_iocb; + + /* + * Punt to a work queue to serialize updates of mtime/size. + */ + aio_req->res = res; + INIT_WORK(&aio_req->work, ovl_aio_complete_work); + queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio_req->work); +} + +static int ovl_init_aio_done_wq(struct super_block *sb) +{ + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -334,8 +366,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb->ki_flags)); + rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); + + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); } else { struct ovl_aio_req *aio_req; @@ -401,15 +434,20 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(ifl); + file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(ifl)); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); file_end_write(real.file); /* Update size */ - ovl_copyattr(inode); + ovl_file_modified(file); } else { struct ovl_aio_req *aio_req; + ret = ovl_init_aio_done_wq(inode->i_sb); + if (ret) + goto out; + ret = -ENOMEM; aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); if (!aio_req) @@ -418,7 +456,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) aio_req->orig_iocb = iocb; kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_rw_complete; + aio_req->iocb.ki_complete = ovl_aio_queue_completion; refcount_set(&aio_req->ref, 2); kiocb_start_write(&aio_req->iocb); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); @@ -492,7 +530,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, file_end_write(real.file); /* Update size */ - ovl_copyattr(inode); + ovl_file_modified(out); revert_creds(old_cred); fdput(real); @@ -573,7 +611,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len revert_creds(old_cred); /* Update size */ - ovl_copyattr(inode); + ovl_file_modified(file); fdput(real); @@ -657,7 +695,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, revert_creds(old_cred); /* Update size */ - ovl_copyattr(inode_out); + ovl_file_modified(file_out); fdput(real_in); fdput(real_out); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b6e98a7d36ce..345b8f161ca4 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -32,10 +32,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (err) return err; - err = ovl_want_write(dentry); - if (err) - goto out; - if (attr->ia_valid & ATTR_SIZE) { /* Truncate should trigger data copy up as well */ full_copy_up = true; @@ -54,7 +50,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, winode = d_inode(upperdentry); err = get_write_access(winode); if (err) - goto out_drop_write; + goto out; } if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) @@ -78,6 +74,10 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ attr->ia_valid &= ~ATTR_OPEN; + err = ovl_want_write(dentry); + if (err) + goto out_put_write; + inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); err = ovl_do_notify_change(ofs, upperdentry, attr); @@ -85,12 +85,12 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (!err) ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); + ovl_drop_write(dentry); +out_put_write: if (winode) put_write_access(winode); } -out_drop_write: - ovl_drop_write(dentry); out: return err; } @@ -339,130 +339,6 @@ static const char *ovl_get_link(struct dentry *dentry, return p; } -bool ovl_is_private_xattr(struct super_block *sb, const char *name) -{ - struct ovl_fs *ofs = OVL_FS(sb); - - if (ofs->config.userxattr) - return strncmp(name, OVL_XATTR_USER_PREFIX, - sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0; - else - return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, - sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0; -} - -int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - int err; - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); - struct dentry *upperdentry = ovl_i_dentry_upper(inode); - struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); - struct path realpath; - const struct cred *old_cred; - - err = ovl_want_write(dentry); - if (err) - goto out; - - if (!value && !upperdentry) { - ovl_path_lower(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); - revert_creds(old_cred); - if (err < 0) - goto out_drop_write; - } - - if (!upperdentry) { - err = ovl_copy_up(dentry); - if (err) - goto out_drop_write; - - realdentry = ovl_dentry_upper(dentry); - } - - old_cred = ovl_override_creds(dentry->d_sb); - if (value) { - err = ovl_do_setxattr(ofs, realdentry, name, value, size, - flags); - } else { - WARN_ON(flags != XATTR_REPLACE); - err = ovl_do_removexattr(ofs, realdentry, name); - } - revert_creds(old_cred); - - /* copy c/mtime */ - ovl_copyattr(inode); - -out_drop_write: - ovl_drop_write(dentry); -out: - return err; -} - -int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, - void *value, size_t size) -{ - ssize_t res; - const struct cred *old_cred; - struct path realpath; - - ovl_i_path_real(inode, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); - revert_creds(old_cred); - return res; -} - -static bool ovl_can_list(struct super_block *sb, const char *s) -{ - /* Never list private (.overlay) */ - if (ovl_is_private_xattr(sb, s)) - return false; - - /* List all non-trusted xattrs */ - if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) - return true; - - /* list other trusted for superuser only */ - return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); -} - -ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) -{ - struct dentry *realdentry = ovl_dentry_real(dentry); - ssize_t res; - size_t len; - char *s; - const struct cred *old_cred; - - old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_listxattr(realdentry, list, size); - revert_creds(old_cred); - if (res <= 0 || size == 0) - return res; - - /* filter out private xattrs */ - for (s = list, len = res; len;) { - size_t slen = strnlen(s, len) + 1; - - /* underlying fs providing us with an broken xattr list? */ - if (WARN_ON(slen > len)) - return -EIO; - - len -= slen; - if (!ovl_can_list(dentry->d_sb, s)) { - res -= slen; - memmove(s, s + slen, len); - } else { - s += slen; - } - } - - return res; -} - #ifdef CONFIG_FS_POSIX_ACL /* * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone @@ -611,10 +487,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); - err = ovl_want_write(dentry); - if (err) - return err; - /* * If ACL is to be removed from a lower file, check if it exists in * the first place before copying it up. @@ -630,7 +502,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, revert_creds(old_cred); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); - goto out_drop_write; + goto out; } posix_acl_release(real_acl); } @@ -638,23 +510,26 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, if (!upperdentry) { err = ovl_copy_up(dentry); if (err) - goto out_drop_write; + goto out; realdentry = ovl_dentry_upper(dentry); } + err = ovl_want_write(dentry); + if (err) + goto out; + old_cred = ovl_override_creds(dentry->d_sb); if (acl) err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); else err = ovl_do_remove_acl(ofs, realdentry, acl_name); revert_creds(old_cred); + ovl_drop_write(dentry); /* copy c/mtime */ ovl_copyattr(inode); - -out_drop_write: - ovl_drop_write(dentry); +out: return err; } @@ -778,14 +653,14 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, unsigned int flags; int err; - err = ovl_want_write(dentry); - if (err) - goto out; - err = ovl_copy_up(dentry); if (!err) { ovl_path_real(dentry, &upperpath); + err = ovl_want_write(dentry); + if (err) + goto out; + old_cred = ovl_override_creds(inode->i_sb); /* * Store immutable/append-only flags in xattr and clear them @@ -798,6 +673,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, if (!err) err = ovl_real_fileattr_set(&upperpath, fa); revert_creds(old_cred); + ovl_drop_write(dentry); /* * Merge real inode flags with inode flags read from @@ -812,7 +688,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, /* Update ctime */ ovl_copyattr(inode); } - ovl_drop_write(dentry); out: return err; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 80391c687c2a..03bc8d5dfa31 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -251,7 +251,10 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, err = -EREMOTE; goto out_err; } - if (ovl_is_whiteout(this)) { + + path.dentry = this; + path.mnt = d->mnt; + if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) { d->stop = d->opaque = true; goto put_and_out; } @@ -264,8 +267,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto put_and_out; } - path.dentry = this; - path.mnt = d->mnt; if (!d_can_lookup(this)) { if (d->is_dir || !last_element) { d->stop = true; @@ -438,7 +439,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, else if (IS_ERR(origin)) return PTR_ERR(origin); - if (upperdentry && !ovl_is_whiteout(upperdentry) && + if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) && inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) goto invalid; @@ -507,6 +508,19 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, return err; } +int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const struct ovl_fh *fh, + bool is_upper, bool set) +{ + int err; + + err = ovl_verify_fh(ofs, dentry, ox, fh); + if (set && err == -ENODATA) + err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + + return err; +} + /* * Verify that @real dentry matches the file handle stored in xattr @name. * @@ -515,9 +529,9 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, * * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ -int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, struct dentry *real, bool is_upper, - bool set) +int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, + bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; @@ -530,9 +544,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, goto fail; } - err = ovl_verify_fh(ofs, dentry, ox, fh); - if (set && err == -ENODATA) - err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set); if (err) goto fail; @@ -548,6 +560,7 @@ fail: goto out; } + /* Get upper dentry from index */ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected) @@ -684,7 +697,7 @@ orphan: goto out; } -static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) +int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name) { char *n, *s; @@ -873,20 +886,27 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, struct dentry *lower, struct dentry *upper) { + const struct ovl_fh *fh; int err; if (ovl_check_origin_xattr(ofs, upper)) return 0; + fh = ovl_get_origin_fh(ofs, lower); + if (IS_ERR(fh)) + return PTR_ERR(fh); + err = ovl_want_write(dentry); if (err) - return err; + goto out; - err = ovl_set_origin(ofs, lower, upper); + err = ovl_set_origin_fh(ofs, fh, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); ovl_drop_write(dentry); +out: + kfree(fh); return err; } @@ -1383,7 +1403,11 @@ bool ovl_lower_positive(struct dentry *dentry) break; } } else { - positive = !ovl_is_whiteout(this); + struct path path = { + .dentry = this, + .mnt = parentpath->layer->mnt, + }; + positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); done = true; dput(this); } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 9817b2dcb132..ca88b2636a57 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -28,7 +28,16 @@ enum ovl_path_type { #define OVL_XATTR_NAMESPACE "overlay." #define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) #define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1) + +#define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1) +#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX +#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1) +#define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX +#define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1) enum ovl_xattr { OVL_XATTR_OPAQUE, @@ -40,6 +49,8 @@ enum ovl_xattr { OVL_XATTR_UUID, OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, + OVL_XATTR_XWHITEOUT, + OVL_XATTR_XWHITEOUTS, }; enum ovl_inode_flag { @@ -398,6 +409,10 @@ static inline bool ovl_open_flags_need_copy_up(int flags) } /* util.c */ +int ovl_get_write_access(struct dentry *dentry); +void ovl_put_write_access(struct dentry *dentry); +void ovl_start_write(struct dentry *dentry); +void ovl_end_write(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); @@ -460,6 +475,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_inode_version_get(struct inode *inode); bool ovl_is_whiteout(struct dentry *dentry); +bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path); struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); @@ -467,9 +483,21 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags); bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); +bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); +bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath); +static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs, + struct dentry *upperdentry) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + return ovl_path_is_whiteout(ofs, &upperpath); +} + static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *upperdentry) { @@ -624,11 +652,15 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, struct dentry *real, bool is_upper, - bool set); + enum ovl_xattr ox, const struct ovl_fh *fh, + bool is_upper, bool set); +int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, + bool is_upper, bool set); struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); +int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); @@ -640,17 +672,24 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); +static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper, + const struct ovl_fh *fh, bool set) +{ + return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh, false, set); +} + static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool set) { - return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin, - false, set); + return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin, + false, set); } static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, struct dentry *upper, bool set) { - return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set); + return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper, + true, set); } /* readdir.c */ @@ -684,17 +723,8 @@ int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); -int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr); -int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, - struct kstat *stat, u32 request_mask, unsigned int flags); int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags); -int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, - void *value, size_t size); -ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, @@ -815,8 +845,9 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentr int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, - struct dentry *upper); +struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); +int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, + struct dentry *upper); /* export.c */ extern const struct export_operations ovl_export_operations; @@ -830,3 +861,12 @@ static inline bool ovl_force_readonly(struct ovl_fs *ofs) { return (!ovl_upper_mnt(ofs) || !ofs->workdir); } + +/* xattr.c */ + +const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs); +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr); +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index f6ff23fd101c..ddab9ea267d1 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -43,8 +43,10 @@ module_param_named(metacopy, ovl_metacopy_def, bool, 0644); MODULE_PARM_DESC(metacopy, "Default to on or off for the metadata only copy up feature"); -enum { +enum ovl_opt { Opt_lowerdir, + Opt_lowerdir_add, + Opt_datadir_add, Opt_upperdir, Opt_workdir, Opt_default_permissions, @@ -140,8 +142,11 @@ static int ovl_verity_mode_def(void) #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) + const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), + fsparam_string("lowerdir+", Opt_lowerdir_add), + fsparam_string("datadir+", Opt_datadir_add), fsparam_string("upperdir", Opt_upperdir), fsparam_string("workdir", Opt_workdir), fsparam_flag("default_permissions", Opt_default_permissions), @@ -238,19 +243,8 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("failed to resolve '%s': %i\n", name, err); goto out; } - err = -EINVAL; - if (ovl_dentry_weird(path->dentry)) { - pr_err("filesystem on '%s' not supported\n", name); - goto out_put; - } - if (!d_is_dir(path->dentry)) { - pr_err("'%s' not a directory\n", name); - goto out_put; - } return 0; -out_put: - path_put_init(path); out: return err; } @@ -268,7 +262,7 @@ static void ovl_unescape(char *s) } } -static int ovl_mount_dir(const char *name, struct path *path, bool upper) +static int ovl_mount_dir(const char *name, struct path *path) { int err = -ENOMEM; char *tmp = kstrdup(name, GFP_KERNEL); @@ -276,68 +270,147 @@ static int ovl_mount_dir(const char *name, struct path *path, bool upper) if (tmp) { ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); - - if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) { - pr_err("filesystem on '%s' not supported as upperdir\n", - tmp); - path_put_init(path); - err = -EINVAL; - } kfree(tmp); } return err; } -static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc, - bool workdir) +static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, + enum ovl_opt layer, const char *name, bool upper) { - int err; - struct ovl_fs *ofs = fc->s_fs_info; - struct ovl_config *config = &ofs->config; struct ovl_fs_context *ctx = fc->fs_private; - struct path path; - char *dup; - err = ovl_mount_dir(name, &path, true); - if (err) - return err; + if (ovl_dentry_weird(path->dentry)) + return invalfc(fc, "filesystem on %s not supported", name); + + if (!d_is_dir(path->dentry)) + return invalfc(fc, "%s is not a directory", name); + /* * Check whether upper path is read-only here to report failures * early. Don't forget to recheck when the superblock is created * as the mount attributes could change. */ - if (__mnt_is_readonly(path.mnt)) { - path_put(&path); - return -EINVAL; + if (upper) { + if (path->dentry->d_flags & DCACHE_OP_REAL) + return invalfc(fc, "filesystem on %s not supported as upperdir", name); + if (__mnt_is_readonly(path->mnt)) + return invalfc(fc, "filesystem on %s is read-only", name); + } else { + if (ctx->lowerdir_all && layer != Opt_lowerdir) + return invalfc(fc, "lowerdir+ and datadir+ cannot follow lowerdir"); + if (ctx->nr_data && layer == Opt_lowerdir_add) + return invalfc(fc, "regular lower layers cannot follow data layers"); + if (ctx->nr == OVL_MAX_STACK) + return invalfc(fc, "too many lower directories, limit is %d", + OVL_MAX_STACK); } + return 0; +} - dup = kstrdup(name, GFP_KERNEL); - if (!dup) { - path_put(&path); +static int ovl_ctx_realloc_lower(struct fs_context *fc) +{ + struct ovl_fs_context *ctx = fc->fs_private; + struct ovl_fs_context_layer *l; + size_t nr; + + if (ctx->nr < ctx->capacity) + return 0; + + nr = min_t(size_t, max(4096 / sizeof(*l), ctx->capacity * 2), + OVL_MAX_STACK); + l = krealloc_array(ctx->lower, nr, sizeof(*l), GFP_KERNEL_ACCOUNT); + if (!l) return -ENOMEM; + + ctx->lower = l; + ctx->capacity = nr; + return 0; +} + +static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, + struct path *path, char **pname) +{ + struct ovl_fs *ofs = fc->s_fs_info; + struct ovl_config *config = &ofs->config; + struct ovl_fs_context *ctx = fc->fs_private; + struct ovl_fs_context_layer *l; + + switch (layer) { + case Opt_workdir: + swap(config->workdir, *pname); + swap(ctx->work, *path); + break; + case Opt_upperdir: + swap(config->upperdir, *pname); + swap(ctx->upper, *path); + break; + case Opt_datadir_add: + ctx->nr_data++; + fallthrough; + case Opt_lowerdir_add: + WARN_ON(ctx->nr >= ctx->capacity); + l = &ctx->lower[ctx->nr++]; + memset(l, 0, sizeof(*l)); + swap(l->name, *pname); + swap(l->path, *path); + break; + default: + WARN_ON(1); } +} - if (workdir) { - kfree(config->workdir); - config->workdir = dup; - path_put(&ctx->work); - ctx->work = path; - } else { - kfree(config->upperdir); - config->upperdir = dup; - path_put(&ctx->upper); - ctx->upper = path; +static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, + enum ovl_opt layer) +{ + char *name = kstrdup(param->string, GFP_KERNEL); + bool upper = (layer == Opt_upperdir || layer == Opt_workdir); + struct path path; + int err; + + if (!name) + return -ENOMEM; + + if (upper) + err = ovl_mount_dir(name, &path); + else + err = ovl_mount_dir_noesc(name, &path); + if (err) + goto out_free; + + err = ovl_mount_dir_check(fc, &path, layer, name, upper); + if (err) + goto out_put; + + if (!upper) { + err = ovl_ctx_realloc_lower(fc); + if (err) + goto out_put; } - return 0; + + /* Store the user provided path string in ctx to show in mountinfo */ + ovl_add_layer(fc, layer, &path, &name); + +out_put: + path_put(&path); +out_free: + kfree(name); + return err; } -static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx) +static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx) { - for (size_t nr = 0; nr < ctx->nr; nr++) { - path_put(&ctx->lower[nr].path); - kfree(ctx->lower[nr].name); - ctx->lower[nr].name = NULL; + struct ovl_fs_context_layer *l = ctx->lower; + + // Reset old user provided lowerdir string + kfree(ctx->lowerdir_all); + ctx->lowerdir_all = NULL; + + for (size_t nr = 0; nr < ctx->nr; nr++, l++) { + path_put(&l->path); + kfree(l->name); + l->name = NULL; } ctx->nr = 0; ctx->nr_data = 0; @@ -346,7 +419,7 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx) /* * Parse lowerdir= mount option: * - * (1) lowerdir=/lower1:/lower2:/lower3::/data1::/data2 + * e.g.: lowerdir=/lower1:/lower2:/lower3::/data1::/data2 * Set "/lower1", "/lower2", and "/lower3" as lower layers and * "/data1" and "/data2" as data lower layers. Any existing lower * layers are replaced. @@ -356,9 +429,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) int err; struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; - char *dup = NULL, *dup_iter; + char *dup = NULL, *iter; ssize_t nr_lower = 0, nr = 0, nr_data = 0; - bool append = false, data_layer = false; + bool data_layer = false; /* * Ensure we're backwards compatible with mount(2) @@ -366,16 +439,21 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) */ /* drop all existing lower layers */ - if (!*name) { - ovl_parse_param_drop_lowerdir(ctx); + ovl_reset_lowerdirs(ctx); + + if (!*name) return 0; - } if (*name == ':') { pr_err("cannot append lower layer"); return -EINVAL; } + // Store user provided lowerdir string to show in mount options + ctx->lowerdir_all = kstrdup(name, GFP_KERNEL); + if (!ctx->lowerdir_all) + return -ENOMEM; + dup = kstrdup(name, GFP_KERNEL); if (!dup) return -ENOMEM; @@ -385,36 +463,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) if (nr_lower < 0) goto out_err; - if ((nr_lower > OVL_MAX_STACK) || - (append && (size_add(ctx->nr, nr_lower) > OVL_MAX_STACK))) { + if (nr_lower > OVL_MAX_STACK) { pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_err; } - if (!append) - ovl_parse_param_drop_lowerdir(ctx); - - /* - * (1) append - * - * We want nr <= nr_lower <= capacity We know nr > 0 and nr <= - * capacity. If nr == 0 this wouldn't be append. If nr + - * nr_lower is <= capacity then nr <= nr_lower <= capacity - * already holds. If nr + nr_lower exceeds capacity, we realloc. - * - * (2) replace - * - * Ensure we're backwards compatible with mount(2) which allows - * "lowerdir=/a:/b:/c,lowerdir=/d:/e:/f" causing the last - * specified lowerdir mount option to win. - * - * We want nr <= nr_lower <= capacity We know either (i) nr == 0 - * or (ii) nr > 0. We also know nr_lower > 0. The capacity - * could've been changed multiple times already so we only know - * nr <= capacity. If nr + nr_lower > capacity we realloc, - * otherwise nr <= nr_lower <= capacity holds already. - */ - nr_lower += ctx->nr; if (nr_lower > ctx->capacity) { err = -ENOMEM; l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), @@ -426,41 +479,21 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) ctx->capacity = nr_lower; } - /* - * (3) By (1) and (2) we know nr <= nr_lower <= capacity. - * (4) If ctx->nr == 0 => replace - * We have verified above that the lowerdir mount option - * isn't an append, i.e., the lowerdir mount option - * doesn't start with ":" or "::". - * (4.1) The lowerdir mount options only contains regular lower - * layers ":". - * => Nothing to verify. - * (4.2) The lowerdir mount options contains regular ":" and - * data "::" layers. - * => We need to verify that data lower layers "::" aren't - * followed by regular ":" lower layers - * (5) If ctx->nr > 0 => append - * We know that there's at least one regular layer - * otherwise we would've failed when parsing the previous - * lowerdir mount option. - * (5.1) The lowerdir mount option is a regular layer ":" append - * => We need to verify that no data layers have been - * specified before. - * (5.2) The lowerdir mount option is a data layer "::" append - * We know that there's at least one regular layer or - * other data layers. => There's nothing to verify. - */ - dup_iter = dup; - for (nr = ctx->nr; nr < nr_lower; nr++) { - l = &ctx->lower[nr]; + iter = dup; + l = ctx->lower; + for (nr = 0; nr < nr_lower; nr++, l++) { memset(l, 0, sizeof(*l)); - err = ovl_mount_dir(dup_iter, &l->path, false); + err = ovl_mount_dir(iter, &l->path); + if (err) + goto out_put; + + err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); if (err) goto out_put; err = -ENOMEM; - l->name = kstrdup(dup_iter, GFP_KERNEL_ACCOUNT); + l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); if (!l->name) goto out_put; @@ -472,8 +505,8 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) break; err = -EINVAL; - dup_iter = strchr(dup_iter, '\0') + 1; - if (*dup_iter) { + iter = strchr(iter, '\0') + 1; + if (*iter) { /* * This is a regular layer so we require that * there are no data layers. @@ -489,7 +522,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) /* This is a data lower layer. */ data_layer = true; - dup_iter++; + iter++; } ctx->nr = nr_lower; ctx->nr_data += nr_data; @@ -497,21 +530,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; out_put: - /* - * We know nr >= ctx->nr < nr_lower. If we failed somewhere - * we want to undo until nr == ctx->nr. This is correct for - * both ctx->nr == 0 and ctx->nr > 0. - */ - for (; nr >= ctx->nr; nr--) { - l = &ctx->lower[nr]; - kfree(l->name); - l->name = NULL; - path_put(&l->path); - - /* don't overflow */ - if (nr == 0) - break; - } + ovl_reset_lowerdirs(ctx); out_err: kfree(dup); @@ -556,11 +575,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_lowerdir: err = ovl_parse_param_lowerdir(param->string, fc); break; + case Opt_lowerdir_add: + case Opt_datadir_add: case Opt_upperdir: - fallthrough; case Opt_workdir: - err = ovl_parse_param_upperdir(param->string, fc, - (Opt_workdir == opt)); + err = ovl_parse_layer(fc, param, opt); break; case Opt_default_permissions: config->default_permissions = true; @@ -617,7 +636,7 @@ static int ovl_get_tree(struct fs_context *fc) static inline void ovl_fs_context_free(struct ovl_fs_context *ctx) { - ovl_parse_param_drop_lowerdir(ctx); + ovl_reset_lowerdirs(ctx); path_put(&ctx->upper); path_put(&ctx->work); kfree(ctx->lower); @@ -933,23 +952,28 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; struct ovl_fs *ofs = OVL_FS(sb); - size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer; + size_t nr, nr_merged_lower, nr_lower = 0; + char **lowerdirs = ofs->config.lowerdirs; /* - * lowerdirs[] starts from offset 1, then - * >= 0 regular lower layers prefixed with : and - * >= 0 data-only lower layers prefixed with :: - * - * we need to escase comma and space like seq_show_option() does and - * we also need to escape the colon separator from lowerdir paths. + * lowerdirs[0] holds the colon separated list that user provided + * with lowerdir mount option. + * lowerdirs[1..numlayer] hold the lowerdir paths that were added + * using the lowerdir+ and datadir+ mount options. + * For now, we do not allow mixing the legacy lowerdir mount option + * with the new lowerdir+ and datadir+ mount options. */ - seq_puts(m, ",lowerdir="); - for (nr = 1; nr < ofs->numlayer; nr++) { - if (nr > 1) - seq_putc(m, ':'); - if (nr >= nr_merged_lower) - seq_putc(m, ':'); - seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\"); + if (lowerdirs[0]) { + seq_show_option(m, "lowerdir", lowerdirs[0]); + } else { + nr_lower = ofs->numlayer; + nr_merged_lower = nr_lower - ofs->numdatalayer; + } + for (nr = 1; nr < nr_lower; nr++) { + if (nr < nr_merged_lower) + seq_show_option(m, "lowerdir+", lowerdirs[nr]); + else + seq_show_option(m, "datadir+", lowerdirs[nr]); } if (ofs->config.upperdir) { seq_show_option(m, "upperdir", ofs->config.upperdir); diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h index 8750da68ab2a..c96d93982021 100644 --- a/fs/overlayfs/params.h +++ b/fs/overlayfs/params.h @@ -32,6 +32,7 @@ struct ovl_fs_context { size_t nr_data; struct ovl_opt_set set; struct ovl_fs_context_layer *lower; + char *lowerdir_all; /* user provided lowerdir string */ }; int ovl_init_fs_context(struct fs_context *fc); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index de39e067ae65..a490fc47c3e7 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -25,6 +25,7 @@ struct ovl_cache_entry { struct ovl_cache_entry *next_maybe_whiteout; bool is_upper; bool is_whiteout; + bool check_xwhiteout; char name[]; }; @@ -47,6 +48,7 @@ struct ovl_readdir_data { int err; bool is_upper; bool d_type_supported; + bool in_xwhiteouts_dir; }; struct ovl_dir_file { @@ -162,6 +164,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, p->ino = 0; p->is_upper = rdd->is_upper; p->is_whiteout = false; + /* Defer check for overlay.whiteout to ovl_iterate() */ + p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; @@ -301,6 +305,8 @@ static inline int ovl_dir_read(const struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); + rdd->in_xwhiteouts_dir = rdd->dentry && + ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { @@ -447,7 +453,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, } /* - * Set d_ino for upper entries. Non-upper entries should always report + * Set d_ino for upper entries if needed. Non-upper entries should always report * the uppermost real inode ino and should not call this function. * * When not all layer are on same fs, report real ino also for upper. @@ -455,8 +461,11 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, * When all layers are on the same fs, and upper has a reference to * copy up origin, call vfs_getattr() on the overlay entry to make * sure that d_ino will be consistent with st_ino from stat(2). + * + * Also checks the overlay.whiteout xattr by doing a full lookup which will return + * negative in this case. */ -static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p) +static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino) { struct dentry *dir = path->dentry; @@ -467,7 +476,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry int xinobits = ovl_xino_bits(ofs); int err = 0; - if (!ovl_same_dev(ofs)) + if (!ovl_same_dev(ofs) && !p->check_xwhiteout) goto out; if (p->name[0] == '.') { @@ -481,6 +490,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry goto get; } } + /* This checks also for xwhiteouts */ this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ @@ -494,6 +504,9 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry } get: + if (!ovl_same_dev(ofs) || !update_ino) + goto out; + type = ovl_path_type(this); if (OVL_TYPE_ORIGIN(type)) { struct kstat stat; @@ -572,7 +585,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list, list_for_each_entry_safe(p, n, list, l_node) { if (strcmp(p->name, ".") != 0 && strcmp(p->name, "..") != 0) { - err = ovl_cache_update_ino(path, p); + err = ovl_cache_update(path, p, true); if (err) return err; } @@ -778,13 +791,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); if (!p->is_whiteout) { - if (!p->ino) { - err = ovl_cache_update_ino(&file->f_path, p); + if (!p->ino || p->check_xwhiteout) { + err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) goto out; } } - /* ovl_cache_update_ino() sets is_whiteout on stale entry */ + /* ovl_cache_update() sets is_whiteout on stale entry */ if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 17864a8d2b85..a0967bb25003 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -445,68 +445,6 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) return ok; } -static int ovl_own_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - -static int ovl_own_xattr_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static int ovl_other_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return ovl_xattr_get(dentry, inode, name, buffer, size); -} - -static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - return ovl_xattr_set(dentry, inode, name, value, size, flags); -} - -static const struct xattr_handler ovl_own_trusted_xattr_handler = { - .prefix = OVL_XATTR_TRUSTED_PREFIX, - .get = ovl_own_xattr_get, - .set = ovl_own_xattr_set, -}; - -static const struct xattr_handler ovl_own_user_xattr_handler = { - .prefix = OVL_XATTR_USER_PREFIX, - .get = ovl_own_xattr_get, - .set = ovl_own_xattr_set, -}; - -static const struct xattr_handler ovl_other_xattr_handler = { - .prefix = "", /* catch all */ - .get = ovl_other_xattr_get, - .set = ovl_other_xattr_set, -}; - -static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = { - &ovl_own_trusted_xattr_handler, - &ovl_other_xattr_handler, - NULL -}; - -static const struct xattr_handler * const ovl_user_xattr_handlers[] = { - &ovl_own_user_xattr_handler, - &ovl_other_xattr_handler, - NULL -}; - static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, struct inode **ptrap, const char *name) { @@ -647,7 +585,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) if (IS_ERR(whiteout)) goto cleanup_temp; - err = ovl_is_whiteout(whiteout); + err = ovl_upper_is_whiteout(ofs, whiteout); /* Best effort cleanup of whiteout and temp file */ if (err) @@ -887,15 +825,20 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, { struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *indexdir; + struct dentry *origin = ovl_lowerstack(oe)->dentry; + const struct ovl_fh *fh; int err; + fh = ovl_get_origin_fh(ofs, origin); + if (IS_ERR(fh)) + return PTR_ERR(fh); + err = mnt_want_write(mnt); if (err) - return err; + goto out_free_fh; /* Verify lower root is upper root origin */ - err = ovl_verify_origin(ofs, upperpath->dentry, - ovl_lowerstack(oe)->dentry, true); + err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true); if (err) { pr_err("failed to verify upper root origin\n"); goto out; @@ -927,9 +870,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, * directory entries. */ if (ovl_check_origin_xattr(ofs, ofs->indexdir)) { - err = ovl_verify_set_fh(ofs, ofs->indexdir, - OVL_XATTR_ORIGIN, - upperpath->dentry, true, false); + err = ovl_verify_origin_xattr(ofs, ofs->indexdir, + OVL_XATTR_ORIGIN, + upperpath->dentry, true, + false); if (err) pr_err("failed to verify index dir 'origin' xattr\n"); } @@ -947,6 +891,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, out: mnt_drop_write(mnt); +out_free_fh: + kfree(fh); return err; } @@ -1382,8 +1328,11 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) ofs->layers = layers; /* * Layer 0 is reserved for upper even if there's no upper. - * For consistency, config.lowerdirs[0] is NULL. + * config.lowerdirs[0] is used for storing the user provided colon + * separated lowerdir string. */ + ofs->config.lowerdirs[0] = ctx->lowerdir_all; + ctx->lowerdir_all = NULL; ofs->numlayer = 1; sb->s_stack_depth = 0; @@ -1493,8 +1442,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); sb->s_magic = OVERLAYFS_SUPER_MAGIC; - sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers : - ovl_trusted_xattr_handlers; + sb->s_xattr = ovl_xattr_handlers(ofs); sb->s_fs_info = ofs; #ifdef CONFIG_FS_POSIX_ACL sb->s_flags |= SB_POSIXACL; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 868afd8834c3..50a201e9cd39 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -17,12 +17,38 @@ #include <linux/ratelimit.h> #include "overlayfs.h" +/* Get write access to upper mnt - may fail if upper sb was remounted ro */ +int ovl_get_write_access(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + return mnt_get_write_access(ovl_upper_mnt(ofs)); +} + +/* Get write access to upper sb - may block if upper sb is frozen */ +void ovl_start_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + sb_start_write(ovl_upper_mnt(ofs)->mnt_sb); +} + int ovl_want_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return mnt_want_write(ovl_upper_mnt(ofs)); } +void ovl_put_write_access(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + mnt_put_write_access(ovl_upper_mnt(ofs)); +} + +void ovl_end_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + sb_end_write(ovl_upper_mnt(ofs)->mnt_sb); +} + void ovl_drop_write(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); @@ -55,7 +81,7 @@ int ovl_can_decode_fh(struct super_block *sb) if (!capable(CAP_DAC_READ_SEARCH)) return 0; - if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry) + if (!exportfs_can_decode_fh(sb->s_export_op)) return 0; return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; @@ -575,6 +601,16 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } +/* + * Use this over ovl_is_whiteout for upper and lower files, as it also + * handles overlay.whiteout xattr whiteout files. + */ +bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path) +{ + return ovl_is_whiteout(path->dentry) || + ovl_path_check_xwhiteout_xattr(ofs, path); +} + struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); @@ -644,22 +680,36 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags) return false; } +/* + * The copy up "transaction" keeps an elevated mnt write count on upper mnt, + * but leaves taking freeze protection on upper sb to lower level helpers. + */ int ovl_copy_up_start(struct dentry *dentry, int flags) { struct inode *inode = d_inode(dentry); int err; err = ovl_inode_lock_interruptible(inode); - if (!err && ovl_already_copied_up_locked(dentry, flags)) { + if (err) + return err; + + if (ovl_already_copied_up_locked(dentry, flags)) err = 1; /* Already copied up */ - ovl_inode_unlock(inode); - } + else + err = ovl_get_write_access(dentry); + if (err) + goto out_unlock; + + return 0; +out_unlock: + ovl_inode_unlock(inode); return err; } void ovl_copy_up_end(struct dentry *dentry) { + ovl_put_write_access(dentry); ovl_inode_unlock(d_inode(dentry)); } @@ -676,6 +726,32 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path) return false; } +bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) +{ + struct dentry *dentry = path->dentry; + int res; + + /* xattr.whiteout must be a zero size regular file */ + if (!d_is_reg(dentry) || i_size_read(d_inode(dentry)) != 0) + return false; + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUT, NULL, 0); + return res >= 0; +} + +bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path) +{ + struct dentry *dentry = path->dentry; + int res; + + /* xattr.whiteouts must be a directory */ + if (!d_is_dir(dentry)) + return false; + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0); + return res >= 0; +} + /* * Load persistent uuid from xattr into s_uuid if found, or store a new * random generated value in s_uuid and in xattr. @@ -760,6 +836,8 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_UUID_POSTFIX "uuid" #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" +#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" +#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -775,6 +853,8 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID), OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, @@ -973,12 +1053,18 @@ static void ovl_cleanup_index(struct dentry *dentry) struct dentry *index = NULL; struct inode *inode; struct qstr name = { }; + bool got_write = false; int err; err = ovl_get_index_name(ofs, lowerdentry, &name); if (err) goto fail; + err = ovl_want_write(dentry); + if (err) + goto fail; + + got_write = true; inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", @@ -1016,6 +1102,8 @@ static void ovl_cleanup_index(struct dentry *dentry) goto fail; out: + if (got_write) + ovl_drop_write(dentry); kfree(name.name); dput(index); return; @@ -1062,8 +1150,12 @@ int ovl_nlink_start(struct dentry *dentry) if (err) return err; + err = ovl_want_write(dentry); + if (err) + goto out_unlock; + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) - goto out; + return 0; old_cred = ovl_override_creds(dentry->d_sb); /* @@ -1074,10 +1166,15 @@ int ovl_nlink_start(struct dentry *dentry) */ err = ovl_set_nlink_upper(dentry); revert_creds(old_cred); - -out: if (err) - ovl_inode_unlock(inode); + goto out_drop_write; + + return 0; + +out_drop_write: + ovl_drop_write(dentry); +out_unlock: + ovl_inode_unlock(inode); return err; } @@ -1086,6 +1183,8 @@ void ovl_nlink_end(struct dentry *dentry) { struct inode *inode = d_inode(dentry); + ovl_drop_write(dentry); + if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { const struct cred *old_cred; @@ -1403,6 +1502,7 @@ void ovl_copyattr(struct inode *inode) realinode = ovl_i_path_real(inode, &realpath); real_idmap = mnt_idmap(realpath.mnt); + spin_lock(&inode->i_lock); vfsuid = i_uid_into_vfsuid(real_idmap, realinode); vfsgid = i_gid_into_vfsgid(real_idmap, realinode); @@ -1413,4 +1513,5 @@ void ovl_copyattr(struct inode *inode) inode_set_mtime_to_ts(inode, inode_get_mtime(realinode)); inode_set_ctime_to_ts(inode, inode_get_ctime(realinode)); i_size_write(inode, i_size_read(realinode)); + spin_unlock(&inode->i_lock); } diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c new file mode 100644 index 000000000000..383978e4663c --- /dev/null +++ b/fs/overlayfs/xattrs.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/fs.h> +#include <linux/xattr.h> +#include "overlayfs.h" + +static bool ovl_is_escaped_xattr(struct super_block *sb, const char *name) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + if (ofs->config.userxattr) + return strncmp(name, OVL_XATTR_ESCAPE_USER_PREFIX, + OVL_XATTR_ESCAPE_USER_PREFIX_LEN) == 0; + else + return strncmp(name, OVL_XATTR_ESCAPE_TRUSTED_PREFIX, + OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN - 1) == 0; +} + +static bool ovl_is_own_xattr(struct super_block *sb, const char *name) +{ + struct ovl_fs *ofs = OVL_FS(sb); + + if (ofs->config.userxattr) + return strncmp(name, OVL_XATTR_USER_PREFIX, + OVL_XATTR_USER_PREFIX_LEN) == 0; + else + return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, + OVL_XATTR_TRUSTED_PREFIX_LEN) == 0; +} + +bool ovl_is_private_xattr(struct super_block *sb, const char *name) +{ + return ovl_is_own_xattr(sb, name) && !ovl_is_escaped_xattr(sb, name); +} + +static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + struct path realpath; + const struct cred *old_cred; + + if (!value && !upperdentry) { + ovl_path_lower(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); + revert_creds(old_cred); + if (err < 0) + goto out; + } + + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out; + + realdentry = ovl_dentry_upper(dentry); + } + + err = ovl_want_write(dentry); + if (err) + goto out; + + old_cred = ovl_override_creds(dentry->d_sb); + if (value) { + err = ovl_do_setxattr(ofs, realdentry, name, value, size, + flags); + } else { + WARN_ON(flags != XATTR_REPLACE); + err = ovl_do_removexattr(ofs, realdentry, name); + } + revert_creds(old_cred); + ovl_drop_write(dentry); + + /* copy c/mtime */ + ovl_copyattr(inode); +out: + return err; +} + +static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, + void *value, size_t size) +{ + ssize_t res; + const struct cred *old_cred; + struct path realpath; + + ovl_i_path_real(inode, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); + revert_creds(old_cred); + return res; +} + +static bool ovl_can_list(struct super_block *sb, const char *s) +{ + /* Never list private (.overlay) */ + if (ovl_is_private_xattr(sb, s)) + return false; + + /* List all non-trusted xattrs */ + if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + return true; + + /* list other trusted for superuser only */ + return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + struct dentry *realdentry = ovl_dentry_real(dentry); + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + ssize_t res; + size_t len; + char *s; + const struct cred *old_cred; + size_t prefix_len, name_len; + + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_listxattr(realdentry, list, size); + revert_creds(old_cred); + if (res <= 0 || size == 0) + return res; + + prefix_len = ofs->config.userxattr ? + OVL_XATTR_USER_PREFIX_LEN : OVL_XATTR_TRUSTED_PREFIX_LEN; + + /* filter out private xattrs */ + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + + len -= slen; + if (!ovl_can_list(dentry->d_sb, s)) { + res -= slen; + memmove(s, s + slen, len); + } else if (ovl_is_escaped_xattr(dentry->d_sb, s)) { + res -= OVL_XATTR_ESCAPE_PREFIX_LEN; + name_len = slen - prefix_len - OVL_XATTR_ESCAPE_PREFIX_LEN; + s += prefix_len; + memmove(s, s + OVL_XATTR_ESCAPE_PREFIX_LEN, name_len + len); + s += name_len; + } else { + s += slen; + } + } + + return res; +} + +static char *ovl_xattr_escape_name(const char *prefix, const char *name) +{ + size_t prefix_len = strlen(prefix); + size_t name_len = strlen(name); + size_t escaped_len; + char *escaped, *s; + + escaped_len = prefix_len + OVL_XATTR_ESCAPE_PREFIX_LEN + name_len; + if (escaped_len > XATTR_NAME_MAX) + return ERR_PTR(-EOPNOTSUPP); + + escaped = kmalloc(escaped_len + 1, GFP_KERNEL); + if (escaped == NULL) + return ERR_PTR(-ENOMEM); + + s = escaped; + memcpy(s, prefix, prefix_len); + s += prefix_len; + memcpy(s, OVL_XATTR_ESCAPE_PREFIX, OVL_XATTR_ESCAPE_PREFIX_LEN); + s += OVL_XATTR_ESCAPE_PREFIX_LEN; + memcpy(s, name, name_len + 1); + + return escaped; +} + +static int ovl_own_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + char *escaped; + int r; + + escaped = ovl_xattr_escape_name(handler->prefix, name); + if (IS_ERR(escaped)) + return PTR_ERR(escaped); + + r = ovl_xattr_get(dentry, inode, escaped, buffer, size); + + kfree(escaped); + + return r; +} + +static int ovl_own_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + char *escaped; + int r; + + escaped = ovl_xattr_escape_name(handler->prefix, name); + if (IS_ERR(escaped)) + return PTR_ERR(escaped); + + r = ovl_xattr_set(dentry, inode, escaped, value, size, flags); + + kfree(escaped); + + return r; +} + +static int ovl_other_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, inode, name, buffer, size); +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_xattr_set(dentry, inode, name, value, size, flags); +} + +static const struct xattr_handler ovl_own_trusted_xattr_handler = { + .prefix = OVL_XATTR_TRUSTED_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_own_user_xattr_handler = { + .prefix = OVL_XATTR_USER_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_other_xattr_handler = { + .prefix = "", /* catch all */ + .get = ovl_other_xattr_get, + .set = ovl_other_xattr_set, +}; + +static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = { + &ovl_own_trusted_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +static const struct xattr_handler * const ovl_user_xattr_handlers[] = { + &ovl_own_user_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs) +{ + return ofs->config.userxattr ? ovl_user_xattr_handlers : + ovl_trusted_xattr_handlers; +} + diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index fe1bf5b6e0cb..59f6b8e32cc9 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -32,7 +32,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, * fully cached or it may be in the process of * being deleted due to a lease break. */ - if (!cfid->has_lease) { + if (!cfid->time || !cfid->has_lease) { spin_unlock(&cfids->cfid_list_lock); return NULL; } @@ -193,10 +193,20 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, npath = path_no_prefix(cifs_sb, path); if (IS_ERR(npath)) { rc = PTR_ERR(npath); - kfree(utf16_path); - return rc; + goto out; } + if (!npath[0]) { + dentry = dget(cifs_sb->root); + } else { + dentry = path_to_dentry(cifs_sb, npath); + if (IS_ERR(dentry)) { + rc = -ENOENT; + goto out; + } + } + cfid->dentry = dentry; + /* * We do not hold the lock for the open because in case * SMB2_open needs to reconnect. @@ -249,6 +259,15 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); + /* + * Set @cfid->has_lease to true before sending out compounded request so + * its lease reference can be put in cached_dir_lease_break() due to a + * potential lease break right after the request is sent or while @cfid + * is still being cached. Concurrent processes won't be to use it yet + * due to @cfid->time being zero. + */ + cfid->has_lease = true; + rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); @@ -263,6 +282,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, cfid->tcon = tcon; cfid->is_open = true; + spin_lock(&cfids->cfid_list_lock); + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; @@ -270,18 +291,25 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ - if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) + rc = -EINVAL; + if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) { + spin_unlock(&cfids->cfid_list_lock); goto oshr_free; + } smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL, NULL); - if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) + if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) { + spin_unlock(&cfids->cfid_list_lock); goto oshr_free; + } qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; - if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) + if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) { + spin_unlock(&cfids->cfid_list_lock); goto oshr_free; + } if (!smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), sizeof(struct smb2_file_all_info), @@ -289,37 +317,24 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, (char *)&cfid->file_all_info)) cfid->file_all_info_is_valid = true; - if (!npath[0]) - dentry = dget(cifs_sb->root); - else { - dentry = path_to_dentry(cifs_sb, npath); - if (IS_ERR(dentry)) { - rc = -ENOENT; - goto oshr_free; - } - } - spin_lock(&cfids->cfid_list_lock); - cfid->dentry = dentry; cfid->time = jiffies; - cfid->has_lease = true; spin_unlock(&cfids->cfid_list_lock); + /* At this point the directory handle is fully cached */ + rc = 0; oshr_free: - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - spin_lock(&cfids->cfid_list_lock); - if (!cfid->has_lease) { - if (rc) { - if (cfid->on_list) { - list_del(&cfid->entry); - cfid->on_list = false; - cfids->num_entries--; - } - rc = -ENOENT; - } else { + if (rc) { + spin_lock(&cfids->cfid_list_lock); + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + } + if (cfid->has_lease) { /* * We are guaranteed to have two references at this * point. One for the caller and one for a potential @@ -327,25 +342,24 @@ oshr_free: * will be closed when the caller closes the cached * handle. */ + cfid->has_lease = false; spin_unlock(&cfids->cfid_list_lock); kref_put(&cfid->refcount, smb2_close_cached_fid); goto out; } + spin_unlock(&cfids->cfid_list_lock); } - spin_unlock(&cfids->cfid_list_lock); +out: if (rc) { if (cfid->is_open) SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); free_cached_dir(cfid); - cfid = NULL; - } -out: - if (rc == 0) { + } else { *ret_cfid = cfid; atomic_inc(&tcon->num_remote_opens); } - + kfree(utf16_path); return rc; } diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 76922fcc4bc6..6d8804fb6535 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -427,6 +427,8 @@ skip_rdma: if (server->nosharesock) seq_printf(m, " nosharesock"); + seq_printf(m, "\nServer capabilities: 0x%x", server->capabilities); + if (server->rdma) seq_printf(m, "\nRDMA "); seq_printf(m, "\nTCP status: %d Instance: %d" @@ -452,6 +454,11 @@ skip_rdma: seq_printf(m, "\n\n\tSessions: "); i = 0; list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); + continue; + } i++; if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || @@ -472,6 +479,7 @@ skip_rdma: ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->ses_status); } + spin_unlock(&ses->ses_lock); seq_printf(m, "\n\tSecurity type: %s ", get_security_type_str(server->ops->select_sectype(server, ses->sectype))); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 22869cda1356..ea3a7a668b45 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1191,6 +1191,7 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode, const struct inode_operations cifs_symlink_inode_ops = { .get_link = cifs_get_link, + .setattr = cifs_setattr, .permission = cifs_permission, .listxattr = cifs_listxattr, }; diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index e17222fec9d2..a75220db5c1e 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -2570,7 +2570,7 @@ typedef struct { struct win_dev { - unsigned char type[8]; /* IntxCHR or IntxBLK */ + unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/ __le64 major; __le64 minor; } __attribute__((packed)); diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 0c37eefa18a5..890ceddae07e 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -81,7 +81,7 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx, extern char *build_wildcard_path_from_dentry(struct dentry *direntry); char *cifs_build_devname(char *nodename, const char *prepath); extern void delete_mid(struct mid_q_entry *mid); -extern void release_mid(struct mid_q_entry *mid); +void __release_mid(struct kref *refcount); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); @@ -740,4 +740,9 @@ static inline bool dfs_src_pathname_equal(const char *s1, const char *s2) return true; } +static inline void release_mid(struct mid_q_entry *mid) +{ + kref_put(&mid->refcount, __release_mid); +} + #endif /* _CIFSPROTO_H */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 7b923e36501b..1a137b33858a 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -119,6 +119,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) static void smb2_query_server_interfaces(struct work_struct *work) { int rc; + int xid; struct cifs_tcon *tcon = container_of(work, struct cifs_tcon, query_interfaces.work); @@ -126,7 +127,10 @@ static void smb2_query_server_interfaces(struct work_struct *work) /* * query server network interfaces, in case they change */ - rc = SMB3_request_interfaces(0, tcon, false); + xid = get_xid(); + rc = SMB3_request_interfaces(xid, tcon, false); + free_xid(xid); + if (rc) { cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); @@ -156,13 +160,14 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, /* If server is a channel, select the primary channel */ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; - spin_lock(&pserver->srv_lock); + /* if we need to signal just this channel */ if (!all_channels) { - pserver->tcpStatus = CifsNeedReconnect; - spin_unlock(&pserver->srv_lock); + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsNeedReconnect; + spin_unlock(&server->srv_lock); return; } - spin_unlock(&pserver->srv_lock); spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { @@ -1969,9 +1974,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) void __cifs_put_smb_ses(struct cifs_ses *ses) { - unsigned int rc, xid; - unsigned int chan_count; struct TCP_Server_Info *server = ses->server; + unsigned int xid; + size_t i; + int rc; spin_lock(&ses->ses_lock); if (ses->ses_status == SES_EXITING) { @@ -2017,20 +2023,14 @@ void __cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); - chan_count = ses->chan_count; - /* close any extra channels */ - if (chan_count > 1) { - int i; - - for (i = 1; i < chan_count; i++) { - if (ses->chans[i].iface) { - kref_put(&ses->chans[i].iface->refcount, release_iface); - ses->chans[i].iface = NULL; - } - cifs_put_tcp_session(ses->chans[i].server, 0); - ses->chans[i].server = NULL; + for (i = 1; i < ses->chan_count; i++) { + if (ses->chans[i].iface) { + kref_put(&ses->chans[i].iface->refcount, release_iface); + ses->chans[i].iface = NULL; } + cifs_put_tcp_session(ses->chans[i].server, 0); + ses->chans[i].server = NULL; } sesInfoFree(ses); @@ -3849,8 +3849,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); spin_unlock(&ses->chan_lock); - if (!is_binding) + if (!is_binding) { ses->ses_status = SES_IN_SETUP; + + /* force iface_list refresh */ + ses->iface_last_update = 0; + } spin_unlock(&ses->ses_lock); /* update ses ip_addr only for primary chan */ diff --git a/fs/smb/client/export.c b/fs/smb/client/export.c index 37c28415df1e..d606e8cbcb7d 100644 --- a/fs/smb/client/export.c +++ b/fs/smb/client/export.c @@ -41,13 +41,12 @@ static struct dentry *cifs_get_parent(struct dentry *dentry) } const struct export_operations cifs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .get_parent = cifs_get_parent, -/* Following five export operations are unneeded so far and can default: - .get_dentry = - .get_name = - .find_exported_dentry = - .decode_fh = - .encode_fs = */ +/* + * Following export operations are mandatory for NFS export support: + * .fh_to_dentry = + */ }; #endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 3abfe77bfa46..86fbd3f847d6 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -594,6 +594,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; + } else if (memcmp("LnxFIFO", pbuf, 8) == 0) { + cifs_dbg(FYI, "FIFO\n"); + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; } else { fattr->cf_mode |= S_IFREG; /* file? */ fattr->cf_dtype = DT_REG; diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index c66be4904e1f..a1da50e66fbb 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -42,23 +42,11 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) rc = cifs_alloc_hash("md5", &md5); if (rc) - goto symlink_hash_err; + return rc; - rc = crypto_shash_init(md5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); - goto symlink_hash_err; - } - rc = crypto_shash_update(md5, link_str, link_len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); - goto symlink_hash_err; - } - rc = crypto_shash_final(md5, md5_hash); + rc = crypto_shash_digest(md5, link_str, link_len, md5_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); - -symlink_hash_err: cifs_free_hash(&md5); return rc; } diff --git a/fs/smb/client/ntlmssp.h b/fs/smb/client/ntlmssp.h index 2c5dde2ece58..875de43b72de 100644 --- a/fs/smb/client/ntlmssp.h +++ b/fs/smb/client/ntlmssp.h @@ -133,8 +133,8 @@ typedef struct _AUTHENTICATE_MESSAGE { SECURITY_BUFFER WorkstationName; SECURITY_BUFFER SessionKey; __le32 NegotiateFlags; - /* SECURITY_BUFFER for version info not present since we - do not set the version is present flag */ + struct ntlmssp_version Version; + /* SECURITY_BUFFER */ char UserString[]; } __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 79f26c560edf..cd474cf98f30 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -186,7 +186,6 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) } if (!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - ses->chan_max = 1; spin_unlock(&ses->chan_lock); cifs_server_dbg(VFS, "no multichannel support\n"); return 0; @@ -1060,10 +1059,16 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; + /* send version information in ntlmssp authenticate also */ flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET | - NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED; - /* we only send version information in ntlmssp negotiate, so do not set this flag */ - flags = flags & ~NTLMSSP_NEGOTIATE_VERSION; + NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_VERSION | + NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED; + + sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR; + sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL; + sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD); + sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3; + tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 25f7cd6f23d6..32dfa0f7a78c 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -787,7 +787,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid, { struct close_cancelled_open *cancelled; - cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC); + cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); if (!cancelled) return -ENOMEM; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index f4849a8ad40b..601e7a187f87 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5089,7 +5089,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions */ - if (!S_ISCHR(mode) && !S_ISBLK(mode)) + if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode)) return rc; cifs_dbg(FYI, "sfu compat create special file\n"); @@ -5137,6 +5137,12 @@ smb2_make_node(unsigned int xid, struct inode *inode, pdev->minor = cpu_to_le64(MINOR(dev)); rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); + } else if (S_ISFIFO(mode)) { + memcpy(pdev->type, "LnxFIFO", 8); + pdev->major = 0; + pdev->minor = 0; + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); } tcon->ses->server->ops->close(xid, tcon, &fid); d_drop(dentry); diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 14710afdc2a3..d553b7a54621 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -76,7 +76,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) return temp; } -static void __release_mid(struct kref *refcount) +void __release_mid(struct kref *refcount) { struct mid_q_entry *midEntry = container_of(refcount, struct mid_q_entry, refcount); @@ -156,15 +156,6 @@ static void __release_mid(struct kref *refcount) mempool_free(midEntry, cifs_mid_poolp); } -void release_mid(struct mid_q_entry *mid) -{ - struct TCP_Server_Info *server = mid->server; - - spin_lock(&server->mid_lock); - kref_put(&mid->refcount, __release_mid); - spin_unlock(&server->mid_lock); -} - void delete_mid(struct mid_q_entry *mid) { diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 319fb9ffc6a0..8983f45f8430 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -34,6 +34,7 @@ #define SMB2_QUERY_INFO_HE 0x0010 #define SMB2_SET_INFO_HE 0x0011 #define SMB2_OPLOCK_BREAK_HE 0x0012 +#define SMB2_SERVER_TO_CLIENT_NOTIFICATION 0x0013 /* The same list in little endian */ #define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) @@ -411,6 +412,7 @@ struct smb2_tree_disconnect_rsp { #define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ #define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ #define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_NOTIFICATIONS 0x00000080 /* New to SMB3.1.1 */ /* Internal types */ #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 @@ -981,6 +983,19 @@ struct smb2_change_notify_rsp { __u8 Buffer[]; /* array of file notify structs */ } __packed; +/* + * SMB2_SERVER_TO_CLIENT_NOTIFICATION: See MS-SMB2 section 2.2.44 + */ + +#define SMB2_NOTIFY_SESSION_CLOSED 0x0000 + +struct smb2_server_client_notification { + struct smb2_hdr hdr; + __le16 StructureSize; + __u16 Reserved; /* MBZ */ + __le32 NotificationType; + __u8 NotificationBuffer[4]; /* MBZ */ +} __packed; /* * SMB2_CREATE See MS-SMB2 section 2.2.13 @@ -1097,16 +1112,23 @@ struct smb2_change_notify_rsp { #define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) #define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) #define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008) +/* FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) should be zero, ignored */ +/* FILE_SYNCHRONOUS_IO_NONALERT cpu_to_le32(0x00000020) should be zero, ignored */ #define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) #define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) #define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) +/* FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400) should be zero, ignored */ #define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) -#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) +#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) /* MBZ */ #define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) #define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) #define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) +/* FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000) should be zero, ignored */ +/* FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000) should be zero, ignored */ +/* FILE_RESERVE_OPFILTER cpu_to_le32(0x00100000) MBZ */ #define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) #define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) +/* #define FILE_OPEN_FOR_FREE_SPACE_QUERY cpu_to_le32(0x00800000) should be zero, ignored */ #define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF) #define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 723763746238..62972f0ff868 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -173,6 +173,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, const struct export_operations squashfs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = squashfs_fh_to_dentry, .fh_to_parent = squashfs_fh_to_parent, .get_parent = squashfs_get_parent diff --git a/fs/super.c b/fs/super.c index 77faad662739..076392396e72 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2160,3 +2160,4 @@ int sb_init_dio_done_wq(struct super_block *sb) destroy_workqueue(wq); return 0; } +EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index a12ac0356c69..6b7652fb8050 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -167,6 +167,18 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, return battr->mmap(of->file, kobj, battr, vma); } +static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, + int whence) +{ + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + + if (battr->llseek) + return battr->llseek(of->file, kobj, battr, offset, whence); + else + return generic_file_llseek(of->file, offset, whence); +} + static int sysfs_kf_bin_open(struct kernfs_open_file *of) { struct bin_attribute *battr = of->kn->priv; @@ -249,6 +261,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = { .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, .open = sysfs_kf_bin_open, + .llseek = sysfs_kf_bin_llseek, }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 8c8d64e76103..f8a594a50ae6 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -2,8 +2,9 @@ /* * event_inode.c - part of tracefs, a pseudo file system for activating tracing * - * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org> + * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt <rostedt@goodmis.org> * Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com> + * Copyright (C) 2023 Google, author: Steven Rostedt <rostedt@goodmis.org> * * eventfs is used to dynamically create inodes and dentries based on the * meta data provided by the tracing system. @@ -23,48 +24,30 @@ #include <linux/delay.h> #include "internal.h" -struct eventfs_inode { - struct list_head e_top_files; -}; +/* + * eventfs_mutex protects the eventfs_inode (ei) dentry. Any access + * to the ei->dentry must be done under this mutex and after checking + * if ei->is_freed is not set. The ei->dentry is released under the + * mutex at the same time ei->is_freed is set. If ei->is_freed is set + * then the ei->dentry is invalid. + */ +static DEFINE_MUTEX(eventfs_mutex); /* - * struct eventfs_file - hold the properties of the eventfs files and - * directories. - * @name: the name of the file or directory to create - * @d_parent: holds parent's dentry - * @dentry: once accessed holds dentry - * @list: file or directory to be added to parent directory - * @ei: list of files and directories within directory - * @fop: file_operations for file or directory - * @iop: inode_operations for file or directory - * @data: something that the caller will want to get to later on - * @mode: the permission that the file or directory should have + * The eventfs_inode (ei) itself is protected by SRCU. It is released from + * its parent's list and will have is_freed set (under eventfs_mutex). + * After the SRCU grace period is over, the ei may be freed. */ -struct eventfs_file { - const char *name; - struct dentry *d_parent; - struct dentry *dentry; - struct list_head list; - struct eventfs_inode *ei; - const struct file_operations *fop; - const struct inode_operations *iop; - /* - * Union - used for deletion - * @del_list: list of eventfs_file to delete - * @rcu: eventfs_file to delete in RCU - * @is_freed: node is freed if one of the above is set - */ - union { - struct list_head del_list; - struct rcu_head rcu; - unsigned long is_freed; - }; - void *data; - umode_t mode; +DEFINE_STATIC_SRCU(eventfs_srcu); + +/* Mode is unsigned short, use the upper bits for flags */ +enum { + EVENTFS_SAVE_MODE = BIT(16), + EVENTFS_SAVE_UID = BIT(17), + EVENTFS_SAVE_GID = BIT(18), }; -static DEFINE_MUTEX(eventfs_mutex); -DEFINE_STATIC_SRCU(eventfs_srcu); +#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, @@ -73,8 +56,88 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); static int eventfs_release(struct inode *inode, struct file *file); +static void update_attr(struct eventfs_attr *attr, struct iattr *iattr) +{ + unsigned int ia_valid = iattr->ia_valid; + + if (ia_valid & ATTR_MODE) { + attr->mode = (attr->mode & ~EVENTFS_MODE_MASK) | + (iattr->ia_mode & EVENTFS_MODE_MASK) | + EVENTFS_SAVE_MODE; + } + if (ia_valid & ATTR_UID) { + attr->mode |= EVENTFS_SAVE_UID; + attr->uid = iattr->ia_uid; + } + if (ia_valid & ATTR_GID) { + attr->mode |= EVENTFS_SAVE_GID; + attr->gid = iattr->ia_gid; + } +} + +static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *iattr) +{ + const struct eventfs_entry *entry; + struct eventfs_inode *ei; + const char *name; + int ret; + + mutex_lock(&eventfs_mutex); + ei = dentry->d_fsdata; + if (ei->is_freed) { + /* Do not allow changes if the event is about to be removed. */ + mutex_unlock(&eventfs_mutex); + return -ENODEV; + } + + /* Preallocate the children mode array if necessary */ + if (!(dentry->d_inode->i_mode & S_IFDIR)) { + if (!ei->entry_attrs) { + ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries, + GFP_KERNEL); + if (!ei->entry_attrs) { + ret = -ENOMEM; + goto out; + } + } + } + + ret = simple_setattr(idmap, dentry, iattr); + if (ret < 0) + goto out; + + /* + * If this is a dir, then update the ei cache, only the file + * mode is saved in the ei->m_children, and the ownership is + * determined by the parent directory. + */ + if (dentry->d_inode->i_mode & S_IFDIR) { + update_attr(&ei->attr, iattr); + + } else { + name = dentry->d_name.name; + + for (int i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (strcmp(name, entry->name) == 0) { + update_attr(&ei->entry_attrs[i], iattr); + break; + } + } + } + out: + mutex_unlock(&eventfs_mutex); + return ret; +} + static const struct inode_operations eventfs_root_dir_inode_operations = { .lookup = eventfs_root_lookup, + .setattr = eventfs_set_attr, +}; + +static const struct inode_operations eventfs_file_inode_operations = { + .setattr = eventfs_set_attr, }; static const struct file_operations eventfs_file_operations = { @@ -85,26 +148,40 @@ static const struct file_operations eventfs_file_operations = { .release = eventfs_release, }; +static void update_inode_attr(struct inode *inode, struct eventfs_attr *attr, umode_t mode) +{ + if (!attr) { + inode->i_mode = mode; + return; + } + + if (attr->mode & EVENTFS_SAVE_MODE) + inode->i_mode = attr->mode & EVENTFS_MODE_MASK; + else + inode->i_mode = mode; + + if (attr->mode & EVENTFS_SAVE_UID) + inode->i_uid = attr->uid; + + if (attr->mode & EVENTFS_SAVE_GID) + inode->i_gid = attr->gid; +} + /** * create_file - create a file in the tracefs filesystem * @name: the name of the file to create. * @mode: the permission that the file should have. + * @attr: saved attributes changed by user * @parent: parent dentry for this file. * @data: something that the caller will want to get to later on. * @fop: struct file_operations that should be used for this file. * - * This is the basic "create a file" function for tracefs. It allows for a - * wide range of flexibility in creating a file. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the tracefs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. - * - * If tracefs is not enabled in the kernel, the value -%ENODEV will be - * returned. + * This function creates a dentry that represents a file in the eventsfs_inode + * directory. The inode.i_private pointer will point to @data in the open() + * call. */ static struct dentry *create_file(const char *name, umode_t mode, + struct eventfs_attr *attr, struct dentry *parent, void *data, const struct file_operations *fop) { @@ -118,6 +195,7 @@ static struct dentry *create_file(const char *name, umode_t mode, if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; + WARN_ON_ONCE(!parent); dentry = eventfs_start_creating(name, parent); if (IS_ERR(dentry)) @@ -127,7 +205,10 @@ static struct dentry *create_file(const char *name, umode_t mode, if (unlikely(!inode)) return eventfs_failed_creating(dentry); - inode->i_mode = mode; + /* If the user updated the directory's attributes, use them */ + update_inode_attr(inode, attr, mode); + + inode->i_op = &eventfs_file_inode_operations; inode->i_fop = fop; inode->i_private = data; @@ -140,28 +221,19 @@ static struct dentry *create_file(const char *name, umode_t mode, /** * create_dir - create a dir in the tracefs filesystem - * @name: the name of the file to create. + * @ei: the eventfs_inode that represents the directory to create * @parent: parent dentry for this file. - * @data: something that the caller will want to get to later on. * - * This is the basic "create a dir" function for eventfs. It allows for a - * wide range of flexibility in creating a dir. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the tracefs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %NULL will be returned. - * - * If tracefs is not enabled in the kernel, the value -%ENODEV will be - * returned. + * This function will create a dentry for a directory represented by + * a eventfs_inode. */ -static struct dentry *create_dir(const char *name, struct dentry *parent, void *data) +static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent) { struct tracefs_inode *ti; struct dentry *dentry; struct inode *inode; - dentry = eventfs_start_creating(name, parent); + dentry = eventfs_start_creating(ei->name, parent); if (IS_ERR(dentry)) return dentry; @@ -169,10 +241,11 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void * if (unlikely(!inode)) return eventfs_failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + /* If the user updated the directory's attributes, use them */ + update_inode_attr(inode, &ei->attr, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; - inode->i_private = data; ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE; @@ -184,117 +257,198 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void * return eventfs_end_creating(dentry); } +static void free_ei(struct eventfs_inode *ei) +{ + kfree_const(ei->name); + kfree(ei->d_children); + kfree(ei->entry_attrs); + kfree(ei); +} + /** - * eventfs_set_ef_status_free - set the ef->status to free + * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode * @ti: the tracefs_inode of the dentry - * @dentry: dentry who's status to be freed + * @dentry: dentry which has the reference to remove. * - * eventfs_set_ef_status_free will be called if no more - * references remain + * Remove the association between a dentry from an eventfs_inode. */ -void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) +void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) { - struct tracefs_inode *ti_parent; struct eventfs_inode *ei; - struct eventfs_file *ef, *tmp; - - /* The top level events directory may be freed by this */ - if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { - LIST_HEAD(ef_del_list); + int i; - mutex_lock(&eventfs_mutex); + mutex_lock(&eventfs_mutex); - ei = ti->private; + ei = dentry->d_fsdata; + if (!ei) + goto out; - /* Record all the top level files */ - list_for_each_entry_srcu(ef, &ei->e_top_files, list, - lockdep_is_held(&eventfs_mutex)) { - list_add_tail(&ef->del_list, &ef_del_list); + /* This could belong to one of the files of the ei */ + if (ei->dentry != dentry) { + for (i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i] == dentry) + break; } + if (WARN_ON_ONCE(i == ei->nr_entries)) + goto out; + ei->d_children[i] = NULL; + } else if (ei->is_freed) { + free_ei(ei); + } else { + ei->dentry = NULL; + } + + dentry->d_fsdata = NULL; + out: + mutex_unlock(&eventfs_mutex); +} - /* Nothing should access this, but just in case! */ - ti->private = NULL; +/** + * create_file_dentry - create a dentry for a file of an eventfs_inode + * @ei: the eventfs_inode that the file will be created under + * @idx: the index into the d_children[] of the @ei + * @parent: The parent dentry of the created file. + * @name: The name of the file to create + * @mode: The mode of the file. + * @data: The data to use to set the inode of the file with on open() + * @fops: The fops of the file to be created. + * @lookup: If called by the lookup routine, in which case, dput() the created dentry. + * + * Create a dentry for a file of an eventfs_inode @ei and place it into the + * address located at @e_dentry. If the @e_dentry already has a dentry, then + * just do a dget() on it and return. Otherwise create the dentry and attach it. + */ +static struct dentry * +create_file_dentry(struct eventfs_inode *ei, int idx, + struct dentry *parent, const char *name, umode_t mode, void *data, + const struct file_operations *fops, bool lookup) +{ + struct eventfs_attr *attr = NULL; + struct dentry **e_dentry = &ei->d_children[idx]; + struct dentry *dentry; + bool invalidate = false; + mutex_lock(&eventfs_mutex); + if (ei->is_freed) { mutex_unlock(&eventfs_mutex); + return NULL; + } + /* If the e_dentry already has a dentry, use it */ + if (*e_dentry) { + /* lookup does not need to up the ref count */ + if (!lookup) + dget(*e_dentry); + mutex_unlock(&eventfs_mutex); + return *e_dentry; + } - /* Now safely free the top level files and their children */ - list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { - list_del(&ef->del_list); - eventfs_remove(ef); - } + /* ei->entry_attrs are protected by SRCU */ + if (ei->entry_attrs) + attr = &ei->entry_attrs[idx]; - kfree(ei); - return; - } + mutex_unlock(&eventfs_mutex); - mutex_lock(&eventfs_mutex); + /* The lookup already has the parent->d_inode locked */ + if (!lookup) + inode_lock(parent->d_inode); - ti_parent = get_tracefs(dentry->d_parent->d_inode); - if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) - goto out; + dentry = create_file(name, mode, attr, parent, data, fops); - ef = dentry->d_fsdata; - if (!ef) - goto out; + if (!lookup) + inode_unlock(parent->d_inode); - /* - * If ef was freed, then the LSB bit is set for d_fsdata. - * But this should not happen, as it should still have a - * ref count that prevents it. Warn in case it does. - */ - if (WARN_ON_ONCE((unsigned long)ef & 1)) - goto out; + mutex_lock(&eventfs_mutex); - dentry->d_fsdata = NULL; - ef->dentry = NULL; -out: + if (IS_ERR_OR_NULL(dentry)) { + /* + * When the mutex was released, something else could have + * created the dentry for this e_dentry. In which case + * use that one. + * + * Note, with the mutex held, the e_dentry cannot have content + * and the ei->is_freed be true at the same time. + */ + dentry = *e_dentry; + if (WARN_ON_ONCE(dentry && ei->is_freed)) + dentry = NULL; + /* The lookup does not need to up the dentry refcount */ + if (dentry && !lookup) + dget(dentry); + mutex_unlock(&eventfs_mutex); + return dentry; + } + + if (!*e_dentry && !ei->is_freed) { + *e_dentry = dentry; + dentry->d_fsdata = ei; + } else { + /* + * Should never happen unless we get here due to being freed. + * Otherwise it means two dentries exist with the same name. + */ + WARN_ON_ONCE(!ei->is_freed); + invalidate = true; + } mutex_unlock(&eventfs_mutex); + + if (invalidate) + d_invalidate(dentry); + + if (lookup || invalidate) + dput(dentry); + + return invalidate ? NULL : dentry; } /** * eventfs_post_create_dir - post create dir routine - * @ef: eventfs_file of recently created dir + * @ei: eventfs_inode of recently created dir * * Map the meta-data of files within an eventfs dir to their parent dentry */ -static void eventfs_post_create_dir(struct eventfs_file *ef) +static void eventfs_post_create_dir(struct eventfs_inode *ei) { - struct eventfs_file *ef_child; + struct eventfs_inode *ei_child; struct tracefs_inode *ti; + lockdep_assert_held(&eventfs_mutex); + /* srcu lock already held */ /* fill parent-child relation */ - list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list, + list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - ef_child->d_parent = ef->dentry; + ei_child->d_parent = ei->dentry; } - ti = get_tracefs(ef->dentry->d_inode); - ti->private = ef->ei; + ti = get_tracefs(ei->dentry->d_inode); + ti->private = ei; } /** - * create_dentry - helper function to create dentry - * @ef: eventfs_file of file or directory to create - * @parent: parent dentry - * @lookup: true if called from lookup routine + * create_dir_dentry - Create a directory dentry for the eventfs_inode + * @pei: The eventfs_inode parent of ei. + * @ei: The eventfs_inode to create the directory for + * @parent: The dentry of the parent of this directory + * @lookup: True if this is called by the lookup code * - * Used to create a dentry for file/dir, executes post dentry creation routine + * This creates and attaches a directory dentry to the eventfs_inode @ei. */ static struct dentry * -create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup) +create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, + struct dentry *parent, bool lookup) { bool invalidate = false; - struct dentry *dentry; + struct dentry *dentry = NULL; mutex_lock(&eventfs_mutex); - if (ef->is_freed) { + if (pei->is_freed || ei->is_freed) { mutex_unlock(&eventfs_mutex); return NULL; } - if (ef->dentry) { - dentry = ef->dentry; - /* On dir open, up the ref count */ + if (ei->dentry) { + /* If the dentry already has a dentry, use it */ + dentry = ei->dentry; + /* lookup does not need to up the ref count */ if (!lookup) dget(dentry); mutex_unlock(&eventfs_mutex); @@ -302,42 +456,44 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup) } mutex_unlock(&eventfs_mutex); + /* The lookup already has the parent->d_inode locked */ if (!lookup) inode_lock(parent->d_inode); - if (ef->ei) - dentry = create_dir(ef->name, parent, ef->data); - else - dentry = create_file(ef->name, ef->mode, parent, - ef->data, ef->fop); + dentry = create_dir(ei, parent); if (!lookup) inode_unlock(parent->d_inode); mutex_lock(&eventfs_mutex); - if (IS_ERR_OR_NULL(dentry)) { - /* If the ef was already updated get it */ - dentry = ef->dentry; + + if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) { + /* + * When the mutex was released, something else could have + * created the dentry for this e_dentry. In which case + * use that one. + * + * Note, with the mutex held, the e_dentry cannot have content + * and the ei->is_freed be true at the same time. + */ + dentry = ei->dentry; if (dentry && !lookup) dget(dentry); mutex_unlock(&eventfs_mutex); return dentry; } - if (!ef->dentry && !ef->is_freed) { - ef->dentry = dentry; - if (ef->ei) - eventfs_post_create_dir(ef); - dentry->d_fsdata = ef; + if (!ei->dentry && !ei->is_freed) { + ei->dentry = dentry; + eventfs_post_create_dir(ei); + dentry->d_fsdata = ei; } else { - /* A race here, should try again (unless freed) */ - invalidate = true; - /* * Should never happen unless we get here due to being freed. * Otherwise it means two dentries exist with the same name. */ - WARN_ON_ONCE(!ef->is_freed); + WARN_ON_ONCE(!ei->is_freed); + invalidate = true; } mutex_unlock(&eventfs_mutex); if (invalidate) @@ -349,50 +505,90 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup) return invalidate ? NULL : dentry; } -static bool match_event_file(struct eventfs_file *ef, const char *name) -{ - bool ret; - - mutex_lock(&eventfs_mutex); - ret = !ef->is_freed && strcmp(ef->name, name) == 0; - mutex_unlock(&eventfs_mutex); - - return ret; -} - /** * eventfs_root_lookup - lookup routine to create file/dir * @dir: in which a lookup is being done * @dentry: file/dir dentry - * @flags: to pass as flags parameter to simple lookup + * @flags: Just passed to simple_lookup() * - * Used to create a dynamic file/dir within @dir. Use the eventfs_inode - * list of meta data to find the information needed to create the file/dir. + * Used to create dynamic file/dir with-in @dir, search with-in @ei + * list, if @dentry found go ahead and create the file/dir */ + static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { + const struct file_operations *fops; + const struct eventfs_entry *entry; + struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct eventfs_file *ef; + struct dentry *ei_dentry = NULL; struct dentry *ret = NULL; + const char *name = dentry->d_name.name; + bool created = false; + umode_t mode; + void *data; int idx; + int i; + int r; ti = get_tracefs(dir); if (!(ti->flags & TRACEFS_EVENT_INODE)) return NULL; - ei = ti->private; + /* Grab srcu to prevent the ei from going away */ idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_srcu(ef, &ei->e_top_files, list, + + /* + * Grab the eventfs_mutex to consistent value from ti->private. + * This s + */ + mutex_lock(&eventfs_mutex); + ei = READ_ONCE(ti->private); + if (ei && !ei->is_freed) + ei_dentry = READ_ONCE(ei->dentry); + mutex_unlock(&eventfs_mutex); + + if (!ei || !ei_dentry) + goto out; + + data = ei->data; + + list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - if (!match_event_file(ef, dentry->d_name.name)) + if (strcmp(ei_child->name, name) != 0) continue; ret = simple_lookup(dir, dentry, flags); - create_dentry(ef, ef->d_parent, true); + create_dir_dentry(ei, ei_child, ei_dentry, true); + created = true; break; } + + if (created) + goto out; + + for (i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (strcmp(name, entry->name) == 0) { + void *cdata = data; + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); + if (r <= 0) + continue; + ret = simple_lookup(dir, dentry, flags); + create_file_dentry(ei, i, ei_dentry, name, mode, cdata, + fops, true); + break; + } + } + out: srcu_read_unlock(&eventfs_srcu, idx); return ret; } @@ -432,29 +628,48 @@ static int eventfs_release(struct inode *inode, struct file *file) return dcache_dir_close(inode, file); } +static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt) +{ + struct dentry **tmp; + + tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); + if (!tmp) + return -1; + tmp[cnt] = d; + tmp[cnt + 1] = NULL; + *dentries = tmp; + return 0; +} + /** * dcache_dir_open_wrapper - eventfs open wrapper * @inode: not used - * @file: dir to be opened (to create its child) + * @file: dir to be opened (to create it's children) * - * Used to dynamically create the file/dir within @file. @file is really a - * directory and all the files/dirs of the children within @file will be - * created. If any of the files/dirs have already been created, their - * reference count will be incremented. + * Used to dynamic create file/dir with-in @file, all the + * file/dir will be created. If already created then references + * will be increased */ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) { + const struct file_operations *fops; + const struct eventfs_entry *entry; + struct eventfs_inode *ei_child; struct tracefs_inode *ti; struct eventfs_inode *ei; - struct eventfs_file *ef; struct dentry_list *dlist; struct dentry **dentries = NULL; - struct dentry *dentry = file_dentry(file); + struct dentry *parent = file_dentry(file); struct dentry *d; struct inode *f_inode = file_inode(file); + const char *name = parent->d_name.name; + umode_t mode; + void *data; int cnt = 0; int idx; int ret; + int i; + int r; ti = get_tracefs(f_inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) @@ -463,25 +678,56 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) if (WARN_ON_ONCE(file->private_data)) return -EINVAL; + idx = srcu_read_lock(&eventfs_srcu); + + mutex_lock(&eventfs_mutex); + ei = READ_ONCE(ti->private); + mutex_unlock(&eventfs_mutex); + + if (!ei) { + srcu_read_unlock(&eventfs_srcu, idx); + return -EINVAL; + } + + + data = ei->data; + dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); - if (!dlist) + if (!dlist) { + srcu_read_unlock(&eventfs_srcu, idx); return -ENOMEM; + } - ei = ti->private; - idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_srcu(ef, &ei->e_top_files, list, + list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { - d = create_dentry(ef, dentry, false); + d = create_dir_dentry(ei, ei_child, parent, false); if (d) { - struct dentry **tmp; + ret = add_dentries(&dentries, d, cnt); + if (ret < 0) + break; + cnt++; + } + } - tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); - if (!tmp) + for (i = 0; i < ei->nr_entries; i++) { + void *cdata = data; + entry = &ei->entries[i]; + name = entry->name; + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); + if (r <= 0) + continue; + d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); + if (d) { + ret = add_dentries(&dentries, d, cnt); + if (ret < 0) break; - tmp[cnt] = d; - tmp[cnt + 1] = NULL; cnt++; - dentries = tmp; } } srcu_read_unlock(&eventfs_srcu, idx); @@ -514,63 +760,104 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) } /** - * eventfs_prepare_ef - helper function to prepare eventfs_file - * @name: the name of the file/directory to create. - * @mode: the permission that the file should have. - * @fop: struct file_operations that should be used for this file/directory. - * @iop: struct inode_operations that should be used for this file/directory. - * @data: something that the caller will want to get to later on. The - * inode.i_private pointer will point to this value on the open() call. + * eventfs_create_dir - Create the eventfs_inode for this directory + * @name: The name of the directory to create. + * @parent: The eventfs_inode of the parent directory. + * @entries: A list of entries that represent the files under this directory + * @size: The number of @entries + * @data: The default data to pass to the files (an entry may override it). + * + * This function creates the descriptor to represent a directory in the + * eventfs. This descriptor is an eventfs_inode, and it is returned to be + * used to create other children underneath. + * + * The @entries is an array of eventfs_entry structures which has: + * const char *name + * eventfs_callback callback; * - * This function allocates and fills the eventfs_file structure. + * The name is the name of the file, and the callback is a pointer to a function + * that will be called when the file is reference (either by lookup or by + * reading a directory). The callback is of the prototype: + * + * int callback(const char *name, umode_t *mode, void **data, + * const struct file_operations **fops); + * + * When a file needs to be created, this callback will be called with + * name = the name of the file being created (so that the same callback + * may be used for multiple files). + * mode = a place to set the file's mode + * data = A pointer to @data, and the callback may replace it, which will + * cause the file created to pass the new data to the open() call. + * fops = the fops to use for the created file. + * + * NB. @callback is called while holding internal locks of the eventfs + * system. The callback must not call any code that might also call into + * the tracefs or eventfs system or it will risk creating a deadlock. */ -static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode, - const struct file_operations *fop, - const struct inode_operations *iop, - void *data) +struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, + const struct eventfs_entry *entries, + int size, void *data) { - struct eventfs_file *ef; + struct eventfs_inode *ei; - ef = kzalloc(sizeof(*ef), GFP_KERNEL); - if (!ef) + if (!parent) + return ERR_PTR(-EINVAL); + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) return ERR_PTR(-ENOMEM); - ef->name = kstrdup(name, GFP_KERNEL); - if (!ef->name) { - kfree(ef); + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) { + kfree(ei); return ERR_PTR(-ENOMEM); } - if (S_ISDIR(mode)) { - ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL); - if (!ef->ei) { - kfree(ef->name); - kfree(ef); + if (size) { + ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + if (!ei->d_children) { + kfree_const(ei->name); + kfree(ei); return ERR_PTR(-ENOMEM); } - INIT_LIST_HEAD(&ef->ei->e_top_files); - } else { - ef->ei = NULL; } - ef->iop = iop; - ef->fop = fop; - ef->mode = mode; - ef->data = data; - return ef; + ei->entries = entries; + ei->nr_entries = size; + ei->data = data; + INIT_LIST_HEAD(&ei->children); + INIT_LIST_HEAD(&ei->list); + + mutex_lock(&eventfs_mutex); + if (!parent->is_freed) { + list_add_tail(&ei->list, &parent->children); + ei->d_parent = parent->dentry; + } + mutex_unlock(&eventfs_mutex); + + /* Was the parent freed? */ + if (list_empty(&ei->list)) { + free_ei(ei); + ei = NULL; + } + return ei; } /** - * eventfs_create_events_dir - create the trace event structure - * @name: the name of the directory to create. - * @parent: parent dentry for this file. This should be a directory dentry - * if set. If this parameter is NULL, then the directory will be - * created in the root of the tracefs filesystem. + * eventfs_create_events_dir - create the top level events directory + * @name: The name of the top level directory to create. + * @parent: Parent dentry for this file in the tracefs directory. + * @entries: A list of entries that represent the files under this directory + * @size: The number of @entries + * @data: The default data to pass to the files (an entry may override it). * * This function creates the top of the trace event directory. + * + * See eventfs_create_dir() for use of @entries. */ -struct dentry *eventfs_create_events_dir(const char *name, - struct dentry *parent) +struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, + const struct eventfs_entry *entries, + int size, void *data) { struct dentry *dentry = tracefs_start_creating(name, parent); struct eventfs_inode *ei; @@ -581,19 +868,32 @@ struct dentry *eventfs_create_events_dir(const char *name, return NULL; if (IS_ERR(dentry)) - return dentry; + return ERR_CAST(dentry); ei = kzalloc(sizeof(*ei), GFP_KERNEL); if (!ei) - return ERR_PTR(-ENOMEM); + goto fail_ei; + inode = tracefs_get_inode(dentry->d_sb); - if (unlikely(!inode)) { - kfree(ei); - tracefs_failed_creating(dentry); - return ERR_PTR(-ENOMEM); + if (unlikely(!inode)) + goto fail; + + if (size) { + ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + if (!ei->d_children) + goto fail; } - INIT_LIST_HEAD(&ei->e_top_files); + ei->dentry = dentry; + ei->entries = entries; + ei->nr_entries = size; + ei->data = data; + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) + goto fail; + + INIT_LIST_HEAD(&ei->children); + INIT_LIST_HEAD(&ei->list); ti = get_tracefs(inode); ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; @@ -603,198 +903,97 @@ struct dentry *eventfs_create_events_dir(const char *name, inode->i_op = &eventfs_root_dir_inode_operations; inode->i_fop = &eventfs_file_operations; + dentry->d_fsdata = ei; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(dentry->d_parent->d_inode); fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return tracefs_end_creating(dentry); -} - -/** - * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later - * @name: the name of the file to create. - * @parent: parent dentry for this dir. - * - * This function adds eventfs subsystem dir to list. - * And all these dirs are created on the fly when they are looked up, - * and the dentry and inodes will be removed when they are done. - */ -struct eventfs_file *eventfs_add_subsystem_dir(const char *name, - struct dentry *parent) -{ - struct tracefs_inode *ti_parent; - struct eventfs_inode *ei_parent; - struct eventfs_file *ef; - - if (security_locked_down(LOCKDOWN_TRACEFS)) - return NULL; - - if (!parent) - return ERR_PTR(-EINVAL); - - ti_parent = get_tracefs(parent->d_inode); - ei_parent = ti_parent->private; + tracefs_end_creating(dentry); - ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL); - if (IS_ERR(ef)) - return ef; + return ei; - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ei_parent->e_top_files); - ef->d_parent = parent; - mutex_unlock(&eventfs_mutex); - return ef; + fail: + kfree(ei->d_children); + kfree(ei); + fail_ei: + tracefs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); } -/** - * eventfs_add_dir - add eventfs dir to list to create later - * @name: the name of the file to create. - * @ef_parent: parent eventfs_file for this dir. - * - * This function adds eventfs dir to list. - * And all these dirs are created on the fly when they are looked up, - * and the dentry and inodes will be removed when they are done. - */ -struct eventfs_file *eventfs_add_dir(const char *name, - struct eventfs_file *ef_parent) -{ - struct eventfs_file *ef; - - if (security_locked_down(LOCKDOWN_TRACEFS)) - return NULL; - - if (!ef_parent) - return ERR_PTR(-EINVAL); - - ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL); - if (IS_ERR(ef)) - return ef; +static LLIST_HEAD(free_list); - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ef_parent->ei->e_top_files); - ef->d_parent = ef_parent->dentry; - mutex_unlock(&eventfs_mutex); - return ef; -} - -/** - * eventfs_add_events_file - add the data needed to create a file for later reference - * @name: the name of the file to create. - * @mode: the permission that the file should have. - * @parent: parent dentry for this file. - * @data: something that the caller will want to get to later on. - * @fop: struct file_operations that should be used for this file. - * - * This function is used to add the information needed to create a - * dentry/inode within the top level events directory. The file created - * will have the @mode permissions. The @data will be used to fill the - * inode.i_private when the open() call is done. The dentry and inodes are - * all created when they are referenced, and removed when they are no - * longer referenced. - */ -int eventfs_add_events_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fop) +static void eventfs_workfn(struct work_struct *work) { - struct tracefs_inode *ti; - struct eventfs_inode *ei; - struct eventfs_file *ef; - - if (security_locked_down(LOCKDOWN_TRACEFS)) - return -ENODEV; - - if (!parent) - return -EINVAL; - - if (!(mode & S_IFMT)) - mode |= S_IFREG; - - if (!parent->d_inode) - return -EINVAL; - - ti = get_tracefs(parent->d_inode); - if (!(ti->flags & TRACEFS_EVENT_INODE)) - return -EINVAL; - - ei = ti->private; - ef = eventfs_prepare_ef(name, mode, fop, NULL, data); - - if (IS_ERR(ef)) - return -ENOMEM; - - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ei->e_top_files); - ef->d_parent = parent; - mutex_unlock(&eventfs_mutex); - return 0; + struct eventfs_inode *ei, *tmp; + struct llist_node *llnode; + + llnode = llist_del_all(&free_list); + llist_for_each_entry_safe(ei, tmp, llnode, llist) { + /* This dput() matches the dget() from unhook_dentry() */ + for (int i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i]) + dput(ei->d_children[i]); + } + /* This should only get here if it had a dentry */ + if (!WARN_ON_ONCE(!ei->dentry)) + dput(ei->dentry); + } } -/** - * eventfs_add_file - add eventfs file to list to create later - * @name: the name of the file to create. - * @mode: the permission that the file should have. - * @ef_parent: parent eventfs_file for this file. - * @data: something that the caller will want to get to later on. - * @fop: struct file_operations that should be used for this file. - * - * This function is used to add the information needed to create a - * file within a subdirectory of the events directory. The file created - * will have the @mode permissions. The @data will be used to fill the - * inode.i_private when the open() call is done. The dentry and inodes are - * all created when they are referenced, and removed when they are no - * longer referenced. - */ -int eventfs_add_file(const char *name, umode_t mode, - struct eventfs_file *ef_parent, - void *data, - const struct file_operations *fop) -{ - struct eventfs_file *ef; - - if (security_locked_down(LOCKDOWN_TRACEFS)) - return -ENODEV; +static DECLARE_WORK(eventfs_work, eventfs_workfn); - if (!ef_parent) - return -EINVAL; +static void free_rcu_ei(struct rcu_head *head) +{ + struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); - if (!(mode & S_IFMT)) - mode |= S_IFREG; + if (ei->dentry) { + /* Do not free the ei until all references of dentry are gone */ + if (llist_add(&ei->llist, &free_list)) + queue_work(system_unbound_wq, &eventfs_work); + return; + } - ef = eventfs_prepare_ef(name, mode, fop, NULL, data); - if (IS_ERR(ef)) - return -ENOMEM; + /* If the ei doesn't have a dentry, neither should its children */ + for (int i = 0; i < ei->nr_entries; i++) { + WARN_ON_ONCE(ei->d_children[i]); + } - mutex_lock(&eventfs_mutex); - list_add_tail(&ef->list, &ef_parent->ei->e_top_files); - ef->d_parent = ef_parent->dentry; - mutex_unlock(&eventfs_mutex); - return 0; + free_ei(ei); } -static void free_ef(struct rcu_head *head) +static void unhook_dentry(struct dentry *dentry) { - struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu); + if (!dentry) + return; + /* + * Need to add a reference to the dentry that is expected by + * simple_recursive_removal(), which will include a dput(). + */ + dget(dentry); - kfree(ef->name); - kfree(ef->ei); - kfree(ef); + /* + * Also add a reference for the dput() in eventfs_workfn(). + * That is required as that dput() will free the ei after + * the SRCU grace period is over. + */ + dget(dentry); } /** * eventfs_remove_rec - remove eventfs dir or file from list - * @ef: eventfs_file to be removed. - * @head: to create list of eventfs_file to be deleted - * @level: to check recursion depth + * @ei: eventfs_inode to be removed. + * @level: prevent recursion from going more than 3 levels deep. * - * The helper function eventfs_remove_rec() is used to clean up and free the - * associated data from eventfs for both of the added functions. + * This function recursively removes eventfs_inodes which + * contains info of files and/or directories. */ -static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level) +static void eventfs_remove_rec(struct eventfs_inode *ei, int level) { - struct eventfs_file *ef_child; + struct eventfs_inode *ei_child; - if (!ef) + if (!ei) return; /* * Check recursion depth. It should never be greater than 3: @@ -806,100 +1005,76 @@ static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, if (WARN_ON_ONCE(level > 3)) return; - if (ef->ei) { - /* search for nested folders or files */ - list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list, - lockdep_is_held(&eventfs_mutex)) { - eventfs_remove_rec(ef_child, head, level + 1); + /* search for nested folders or files */ + list_for_each_entry_srcu(ei_child, &ei->children, list, + lockdep_is_held(&eventfs_mutex)) { + /* Children only have dentry if parent does */ + WARN_ON_ONCE(ei_child->dentry && !ei->dentry); + eventfs_remove_rec(ei_child, level + 1); + } + + + ei->is_freed = 1; + + for (int i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i]) { + /* Children only have dentry if parent does */ + WARN_ON_ONCE(!ei->dentry); + unhook_dentry(ei->d_children[i]); } } - list_del_rcu(&ef->list); - list_add_tail(&ef->del_list, head); + unhook_dentry(ei->dentry); + + list_del_rcu(&ei->list); + call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); } /** - * eventfs_remove - remove eventfs dir or file from list - * @ef: eventfs_file to be removed. + * eventfs_remove_dir - remove eventfs dir or file from list + * @ei: eventfs_inode to be removed. * * This function acquire the eventfs_mutex lock and call eventfs_remove_rec() */ -void eventfs_remove(struct eventfs_file *ef) +void eventfs_remove_dir(struct eventfs_inode *ei) { - struct eventfs_file *tmp; - LIST_HEAD(ef_del_list); - struct dentry *dentry_list = NULL; struct dentry *dentry; - if (!ef) + if (!ei) return; mutex_lock(&eventfs_mutex); - eventfs_remove_rec(ef, &ef_del_list, 0); - list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { - if (ef->dentry) { - unsigned long ptr = (unsigned long)dentry_list; - - /* Keep the dentry from being freed yet */ - dget(ef->dentry); - - /* - * Paranoid: The dget() above should prevent the dentry - * from being freed and calling eventfs_set_ef_status_free(). - * But just in case, set the link list LSB pointer to 1 - * and have eventfs_set_ef_status_free() check that to - * make sure that if it does happen, it will not think - * the d_fsdata is an event_file. - * - * For this to work, no event_file should be allocated - * on a odd space, as the ef should always be allocated - * to be at least word aligned. Check for that too. - */ - WARN_ON_ONCE(ptr & 1); - - ef->dentry->d_fsdata = (void *)(ptr | 1); - dentry_list = ef->dentry; - ef->dentry = NULL; - } - call_srcu(&eventfs_srcu, &ef->rcu, free_ef); - } + dentry = ei->dentry; + eventfs_remove_rec(ei, 0); mutex_unlock(&eventfs_mutex); - while (dentry_list) { - unsigned long ptr; - - dentry = dentry_list; - ptr = (unsigned long)dentry->d_fsdata & ~1UL; - dentry_list = (struct dentry *)ptr; - dentry->d_fsdata = NULL; - d_invalidate(dentry); - mutex_lock(&eventfs_mutex); - /* dentry should now have at least a single reference */ - WARN_ONCE((int)d_count(dentry) < 1, - "dentry %p less than one reference (%d) after invalidate\n", - dentry, d_count(dentry)); - mutex_unlock(&eventfs_mutex); - dput(dentry); - } + /* + * If any of the ei children has a dentry, then the ei itself + * must have a dentry. + */ + if (dentry) + simple_recursive_removal(dentry, NULL); } /** - * eventfs_remove_events_dir - remove eventfs dir or file from list - * @dentry: events's dentry to be removed. + * eventfs_remove_events_dir - remove the top level eventfs directory + * @ei: the event_inode returned by eventfs_create_events_dir(). * - * This function remove events main directory + * This function removes the events main directory */ -void eventfs_remove_events_dir(struct dentry *dentry) +void eventfs_remove_events_dir(struct eventfs_inode *ei) { - struct tracefs_inode *ti; - - if (!dentry || !dentry->d_inode) - return; + struct dentry *dentry; - ti = get_tracefs(dentry->d_inode); - if (!ti || !(ti->flags & TRACEFS_EVENT_INODE)) - return; + dentry = ei->dentry; + eventfs_remove_dir(ei); - d_invalidate(dentry); + /* + * Matches the dget() done by tracefs_start_creating() + * in eventfs_create_events_dir() when it the dentry was + * created. In other words, it's a normal dentry that + * sticks around while the other ei->dentry are created + * and destroyed dynamically. + */ dput(dentry); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 429603d865a9..5b54948514fe 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) ti = get_tracefs(inode); if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ef_status_free(ti, dentry); + eventfs_set_ei_status_free(ti, dentry); iput(inode); } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 4f2e49e2197b..ccee18ca66c7 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -13,6 +13,58 @@ struct tracefs_inode { struct inode vfs_inode; }; +/* + * struct eventfs_attr - cache the mode and ownership of a eventfs entry + * @mode: saved mode plus flags of what is saved + * @uid: saved uid if changed + * @gid: saved gid if changed + */ +struct eventfs_attr { + int mode; + kuid_t uid; + kgid_t gid; +}; + +/* + * struct eventfs_inode - hold the properties of the eventfs directories. + * @list: link list into the parent directory + * @entries: the array of entries representing the files in the directory + * @name: the name of the directory to create + * @children: link list into the child eventfs_inode + * @dentry: the dentry of the directory + * @d_parent: pointer to the parent's dentry + * @d_children: The array of dentries to represent the files when created + * @entry_attrs: Saved mode and ownership of the @d_children + * @attr: Saved mode and ownership of eventfs_inode itself + * @data: The private data to pass to the callbacks + * @is_freed: Flag set if the eventfs is on its way to be freed + * Note if is_freed is set, then dentry is corrupted. + * @nr_entries: The number of items in @entries + */ +struct eventfs_inode { + struct list_head list; + const struct eventfs_entry *entries; + const char *name; + struct list_head children; + struct dentry *dentry; /* Check is_freed to access */ + struct dentry *d_parent; + struct dentry **d_children; + struct eventfs_attr *entry_attrs; + struct eventfs_attr attr; + void *data; + /* + * Union - used for deletion + * @llist: for calling dput() if needed after RCU + * @rcu: eventfs_inode to delete in RCU + */ + union { + struct llist_node llist; + struct rcu_head rcu; + }; + unsigned int is_freed:1; + unsigned int nr_entries:31; +}; + static inline struct tracefs_inode *get_tracefs(const struct inode *inode) { return container_of(inode, struct tracefs_inode, vfs_inode); @@ -25,6 +77,6 @@ struct inode *tracefs_get_inode(struct super_block *sb); struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); struct dentry *eventfs_failed_creating(struct dentry *dentry); struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry); +void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7af442de44c3..3b13c648d490 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -725,7 +725,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_inode *dir_ui = ubifs_inode(dir); - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, sz_change; struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; struct fscrypt_name nm; @@ -749,6 +749,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, if (err) return err; + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + err = dbg_check_synced_i_size(c, inode); if (err) goto out_fname; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2e65fd2dbdc3..2d2b39f843ce 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1375,6 +1375,9 @@ static inline int mctime_update_needed(const struct inode *inode, /** * ubifs_update_time - update time of inode. * @inode: inode to update + * @time: timespec structure to hold the current time value + * @flags: time updating control flag determines updating + * which time fields of @inode * * This function updates time of the inode. */ diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index d69d2154645b..f0a5538c84b0 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1607,6 +1607,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); ubifs_dump_node(c, dn, dn_size); + err = -EUCLEAN; goto out_free; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0d0478815d4d..09e270d6ed02 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -919,8 +919,10 @@ static void free_buds(struct ubifs_info *c) { struct ubifs_bud *bud, *n; - rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) { + kfree(bud->log_hash); kfree(bud); + } } /** @@ -1189,6 +1191,7 @@ static void destroy_journal(struct ubifs_info *c) bud = list_entry(c->old_buds.next, struct ubifs_bud, list); list_del(&bud->list); + kfree(bud->log_hash); kfree(bud); } ubifs_destroy_idx_gc(c); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 6b7d95b65f4b..f4728e65d1bd 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -65,6 +65,7 @@ static void do_insert_old_idx(struct ubifs_info *c, else { ubifs_err(c, "old idx added twice!"); kfree(old_idx); + return; } } rb_link_node(&old_idx->rb, parent, p); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 23377c1baed9..a480810cd4e3 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -137,6 +137,7 @@ static struct dentry *ufs_get_parent(struct dentry *child) } static const struct export_operations ufs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ufs_fh_to_dentry, .fh_to_parent = ufs_fh_to_parent, .get_parent = ufs_get_parent, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 30c931b38853..be62acffad6c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -21,7 +21,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_quota.h" @@ -2989,7 +2989,7 @@ xfs_bmap_extsize_align( * If realtime, and the result isn't a multiple of the realtime * extent size we need to remove blocks until it is. */ - if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) { /* * We're not covering the original request, or * we won't be able to once we fix the length. @@ -3016,7 +3016,7 @@ xfs_bmap_extsize_align( else { align_alen -= orig_off - align_off; align_off = orig_off; - align_alen -= align_alen % mp->m_sb.sb_rextsize; + align_alen -= xfs_extlen_to_rtxmod(mp, align_alen); } /* * Result doesn't cover the request, fail it. @@ -4826,12 +4826,8 @@ xfs_bmap_del_extent_delay( ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); - if (isrt) { - uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); - - do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_frextents(mp, rtexts); - } + if (isrt) + xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); /* * Update the inode delalloc counter now and wait to update the @@ -5057,33 +5053,20 @@ xfs_bmap_del_extent_real( flags = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_filblks_t len; - xfs_extlen_t mod; - - len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 0); - if (!(bflags & XFS_BMAPI_REMAP)) { - xfs_fsblock_t bno; - - bno = div_u64_rem(del->br_startblock, - mp->m_sb.sb_rextsize, &mod); - ASSERT(mod == 0); - - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + error = xfs_rtfree_blocks(tp, del->br_startblock, + del->br_blockcount); if (error) goto done; } do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; } else { do_fx = 1; - nblks = del->br_blockcount; qfield = XFS_TRANS_DQ_BCOUNT; } + nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; if (cur) { @@ -5289,7 +5272,6 @@ __xfs_bunmapi( int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ - xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t end; struct xfs_iext_cursor icur; @@ -5384,8 +5366,8 @@ __xfs_bunmapi( if (!isrt) goto delete; - sum = del.br_startblock + del.br_blockcount; - div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod); + mod = xfs_rtb_to_rtxoff(mp, + del.br_startblock + del.br_blockcount); if (mod) { /* * Realtime extent not lined up at the end. @@ -5432,7 +5414,8 @@ __xfs_bunmapi( goto error0; goto nodelete; } - div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod); + + mod = xfs_rtb_to_rtxoff(mp, del.br_startblock); if (mod) { xfs_extlen_t off = mp->m_sb.sb_rextsize - mod; @@ -6209,8 +6192,8 @@ xfs_bmap_validate_extent( return __this_address; if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { - if (!xfs_verify_rtext(mp, irec->br_startblock, - irec->br_blockcount)) + if (!xfs_verify_rtbext(mp, irec->br_startblock, + irec->br_blockcount)) return __this_address; } else { if (!xfs_verify_fsbext(mp, irec->br_startblock, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 371dc07233e0..9a88aba1589f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -98,7 +98,7 @@ typedef struct xfs_sb { uint32_t sb_blocksize; /* logical block size, bytes */ xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ - xfs_rtblock_t sb_rextents; /* number of realtime extents */ + xfs_rtbxlen_t sb_rextents; /* number of realtime extents */ uuid_t sb_uuid; /* user-visible file system unique id */ xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ @@ -691,6 +691,22 @@ struct xfs_agfl { xfs_daddr_to_agno(mp, (d) + (len) - 1))) /* + * Realtime bitmap information is accessed by the word, which is currently + * stored in host-endian format. + */ +union xfs_rtword_raw { + __u32 old; +}; + +/* + * Realtime summary counts are accessed by the word, which is currently + * stored in host-endian format. + */ +union xfs_suminfo_raw { + __u32 old; +}; + +/* * XFS Timestamps * ============== * @@ -1142,24 +1158,10 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) #define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) -#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) -#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) /* - * RT Summary and bit manipulation macros. + * RT bit manipulation macros. */ -#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) -#define XFS_SUMOFFSTOBLOCK(mp,s) \ - (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) -#define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((bp)->b_addr + \ - (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) - -#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) -#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) -#define XFS_BITTOWORD(mp,bi) \ - ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) - #define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) #define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 396648acb5be..c269d704314d 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -16,6 +16,7 @@ #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_error.h" +#include "xfs_rtbitmap.h" /* * Realtime allocator bitmap functions shared with userspace. @@ -46,25 +47,69 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { .verify_write = xfs_rtbuf_verify_write, }; +/* Release cached rt bitmap and summary buffers. */ +void +xfs_rtbuf_cache_relse( + struct xfs_rtalloc_args *args) +{ + if (args->rbmbp) { + xfs_trans_brelse(args->tp, args->rbmbp); + args->rbmbp = NULL; + args->rbmoff = NULLFILEOFF; + } + if (args->sumbp) { + xfs_trans_brelse(args->tp, args->sumbp); + args->sumbp = NULL; + args->sumoff = NULLFILEOFF; + } +} + /* * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ int xfs_rtbuf_get( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t block, /* block number in bitmap or summary */ - int issum, /* is summary not bitmap */ - struct xfs_buf **bpp) /* output: buffer for the block */ + struct xfs_rtalloc_args *args, + xfs_fileoff_t block, /* block number in bitmap or summary */ + int issum) /* is summary not bitmap */ { - struct xfs_buf *bp; /* block buffer, result */ - xfs_inode_t *ip; /* bitmap or summary inode */ - xfs_bmbt_irec_t map; - int nmap = 1; - int error; /* error value */ + struct xfs_mount *mp = args->mp; + struct xfs_buf **cbpp; /* cached block buffer */ + xfs_fileoff_t *coffp; /* cached block number */ + struct xfs_buf *bp; /* block buffer, result */ + struct xfs_inode *ip; /* bitmap or summary inode */ + struct xfs_bmbt_irec map; + enum xfs_blft type; + int nmap = 1; + int error; - ip = issum ? mp->m_rsumip : mp->m_rbmip; + if (issum) { + cbpp = &args->sumbp; + coffp = &args->sumoff; + ip = mp->m_rsumip; + type = XFS_BLFT_RTSUMMARY_BUF; + } else { + cbpp = &args->rbmbp; + coffp = &args->rbmoff; + ip = mp->m_rbmip; + type = XFS_BLFT_RTBITMAP_BUF; + } + + /* + * If we have a cached buffer, and the block number matches, use that. + */ + if (*cbpp && *coffp == block) + return 0; + + /* + * Otherwise we have to have to get the buffer. If there was an old + * one, get rid of it first. + */ + if (*cbpp) { + xfs_trans_brelse(args->tp, *cbpp); + *cbpp = NULL; + } error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0); if (error) @@ -74,15 +119,15 @@ xfs_rtbuf_get( return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); if (error) return error; - xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF - : XFS_BLFT_RTBITMAP_BUF); - *bpp = bp; + xfs_trans_buf_set_type(args->tp, bp, type); + *cbpp = bp; + *coffp = block; return 0; } @@ -92,47 +137,44 @@ xfs_rtbuf_get( */ int xfs_rtfind_back( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to look at */ + xfs_rtxnum_t limit, /* last rtext to look at */ + xfs_rtxnum_t *rtx) /* out: start rtext found */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t firstbit; /* first useful bit in the word */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; /* error value */ + xfs_rtxnum_t firstbit; /* first useful bit in the word */ + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute and read in starting bitmap block for starting block. */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + block = xfs_rtx_to_rbmblock(mp, start); + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Get the first word's index & point to it. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); len = start - limit + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + incore = xfs_rtbitmap_getword(args, word); + want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0; /* * If the starting position is not word-aligned, deal with the * partial word. @@ -149,13 +191,12 @@ xfs_rtfind_back( * Calculate the difference between the value there * and what we're looking for. */ - if ((wdiff = (*b ^ want) & mask)) { + if ((wdiff = (incore ^ want) & mask)) { /* * Different. Mark where we are and return. */ - xfs_trans_brelse(tp, bp); i = bit - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } i = bit - firstbit + 1; @@ -167,19 +208,11 @@ xfs_rtfind_back( /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, --block); + if (error) return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; + + word = mp->m_blockwsize - 1; } } else { /* @@ -195,13 +228,13 @@ xfs_rtfind_back( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ want)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ want)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } i += XFS_NBWORD; @@ -213,19 +246,11 @@ xfs_rtfind_back( /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, --block); + if (error) return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; + + word = mp->m_blockwsize - 1; } } /* @@ -242,13 +267,13 @@ xfs_rtfind_back( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ want) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ want) & mask)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } else i = len; @@ -256,8 +281,7 @@ xfs_rtfind_back( /* * No match, return that we scanned the whole area. */ - xfs_trans_brelse(tp, bp); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } @@ -267,47 +291,44 @@ xfs_rtfind_back( */ int xfs_rtfind_forw( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to look at */ + xfs_rtxnum_t limit, /* last rtext to look at */ + xfs_rtxnum_t *rtx) /* out: start rtext found */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in the word */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t lastbit;/* last useful bit in the word */ + xfs_rtxnum_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute and read in starting bitmap block for starting block. */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + block = xfs_rtx_to_rbmblock(mp, start); + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Get the first word's index & point to it. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); len = limit - start + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + incore = xfs_rtbitmap_getword(args, word); + want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0; /* * If the starting position is not word-aligned, deal with the * partial word. @@ -323,13 +344,12 @@ xfs_rtfind_forw( * Calculate the difference between the value there * and what we're looking for. */ - if ((wdiff = (*b ^ want) & mask)) { + if ((wdiff = (incore ^ want) & mask)) { /* * Different. Mark where we are and return. */ - xfs_trans_brelse(tp, bp); i = XFS_RTLOBIT(wdiff) - bit; - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } i = lastbit - bit; @@ -337,22 +357,15 @@ xfs_rtfind_forw( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the previous word in the buffer. - */ - b++; } } else { /* @@ -368,13 +381,13 @@ xfs_rtfind_forw( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ want)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ want)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } i += XFS_NBWORD; @@ -382,22 +395,15 @@ xfs_rtfind_forw( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } /* @@ -412,13 +418,13 @@ xfs_rtfind_forw( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ want) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ want) & mask)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } else i = len; @@ -426,11 +432,25 @@ xfs_rtfind_forw( /* * No match, return that we scanned the whole area. */ - xfs_trans_brelse(tp, bp); - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } +/* Log rtsummary counter at @infoword. */ +static inline void +xfs_trans_log_rtsummary( + struct xfs_rtalloc_args *args, + unsigned int infoword) +{ + struct xfs_buf *bp = args->sumbp; + size_t first, last; + + first = (void *)xfs_rsumblock_infoptr(args, infoword) - bp->b_addr; + last = first + sizeof(xfs_suminfo_t) - 1; + + xfs_trans_log_buf(args->tp, bp, first, last); +} + /* * Read and/or modify the summary information for a given extent size, * bitmap block combination. @@ -442,86 +462,77 @@ xfs_rtfind_forw( */ int xfs_rtmodify_summary_int( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - struct xfs_buf *bp; /* buffer for the summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ + struct xfs_mount *mp = args->mp; + int error; + xfs_fileoff_t sb; /* summary fsblock */ + xfs_rtsumoff_t so; /* index into the summary file */ + unsigned int infoword; /* * Compute entry number in the summary file. */ - so = XFS_SUMOFFS(mp, log, bbno); + so = xfs_rtsumoffs(mp, log, bbno); /* * Compute the block number in the summary file. */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (*rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (*rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - *rbpp = bp; - *rsb = sb; - } + sb = xfs_rtsumoffs_to_block(mp, so); + + error = xfs_rtsummary_read_buf(args, sb); + if (error) + return error; + /* * Point to the summary information, modify/log it, and/or copy it out. */ - sp = XFS_SUMPTR(mp, bp, so); + infoword = xfs_rtsumoffs_to_infoword(mp, so); if (delta) { - uint first = (uint)((char *)sp - (char *)bp->b_addr); + xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta); - *sp += delta; if (mp->m_rsum_cache) { - if (*sp == 0 && log == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno]++; - if (*sp != 0 && log < mp->m_rsum_cache[bbno]) + if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) mp->m_rsum_cache[bbno] = log; + if (val != 0 && log >= mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; } - xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); + xfs_trans_log_rtsummary(args, infoword); + if (sum) + *sum = val; + } else if (sum) { + *sum = xfs_suminfo_get(args, infoword); } - if (sum) - *sum = *sp; return 0; } int xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int delta) /* in/out: summary block number */ +{ + return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL); +} + +/* Log rtbitmap block from the word @from to the byte before @next. */ +static inline void +xfs_trans_log_rtbitmap( + struct xfs_rtalloc_args *args, + unsigned int from, + unsigned int next) { - return xfs_rtmodify_summary_int(mp, tp, log, bbno, - delta, rbpp, rsb, NULL); + struct xfs_buf *bp = args->rbmbp; + size_t first, last; + + first = (void *)xfs_rbmblock_wordptr(args, from) - bp->b_addr; + last = ((void *)xfs_rbmblock_wordptr(args, next) - 1) - bp->b_addr; + + xfs_trans_log_buf(args->tp, bp, first, last); } /* @@ -530,41 +541,37 @@ xfs_rtmodify_summary( */ int xfs_rtmodify_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to modify */ - xfs_extlen_t len, /* length of extent to modify */ - int val) /* 1 for free, 0 for allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to modify */ + xfs_rtxlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtword_t *first; /* first used word in the buffer */ - int i; /* current bit number rel. to start */ - int lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask o frelevant bits for value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t incore; + unsigned int firstword; /* first word used in the buffer */ + unsigned int word; /* word number in the buffer */ /* * Compute starting bitmap block number. */ - block = XFS_BITTOBLOCK(mp, start); + block = xfs_rtx_to_rbmblock(mp, start); /* * Read the bitmap block, and point to its data. */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Compute the starting word's address, and starting bit. */ - word = XFS_BITTOWORD(mp, start); - first = b = &bufp[word]; + firstword = word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); /* * 0 (allocated) => all zeroes; 1 (free) => all ones. @@ -583,34 +590,28 @@ xfs_rtmodify_range( /* * Set/clear the active bits. */ + incore = xfs_rtbitmap_getword(args, word); if (val) - *b |= mask; + incore |= mask; else - *b &= ~mask; + incore &= ~mask; + xfs_rtbitmap_setword(args, word, incore); i = lastbit - bit; /* * Go on to the next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * Log the changed part of this block. * Get the next one. */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + xfs_trans_log_rtbitmap(args, firstword, word); + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; + + firstword = word = 0; } } else { /* @@ -626,31 +627,23 @@ xfs_rtmodify_range( /* * Set the word value correctly. */ - *b = val; + xfs_rtbitmap_setword(args, word, val); i += XFS_NBWORD; /* * Go on to the next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * Log the changed part of this block. * Get the next one. */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + xfs_trans_log_rtbitmap(args, firstword, word); + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; + + firstword = word = 0; } } /* @@ -665,18 +658,19 @@ xfs_rtmodify_range( /* * Set/clear the active bits. */ + incore = xfs_rtbitmap_getword(args, word); if (val) - *b |= mask; + incore |= mask; else - *b &= ~mask; - b++; + incore &= ~mask; + xfs_rtbitmap_setword(args, word, incore); + word++; } /* * Log any remaining changed bytes. */ - if (b > first) - xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp - 1)); + if (word > firstword) + xfs_trans_log_rtbitmap(args, firstword, word); return 0; } @@ -686,23 +680,21 @@ xfs_rtmodify_range( */ int xfs_rtfree_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to free */ - xfs_extlen_t len, /* length to free */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to free */ + xfs_rtxlen_t len) /* in/out: summary block number */ { - xfs_rtblock_t end; /* end of the freed extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block freed > end */ - xfs_rtblock_t preblock; /* first block freed < start */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtxnum_t postblock; /* first rtext freed > end */ + xfs_rtxnum_t preblock; /* first rtext freed < start */ end = start + len - 1; /* * Modify the bitmap to mark this extent freed. */ - error = xfs_rtmodify_range(mp, tp, start, len, 1); + error = xfs_rtmodify_range(args, start, len, 1); if (error) { return error; } @@ -711,15 +703,15 @@ xfs_rtfree_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + error = xfs_rtfind_back(args, start, 0, &preblock); if (error) { return error; } /* * Find the next allocated block (end of allocated extent). */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); + error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + &postblock); if (error) return error; /* @@ -727,9 +719,9 @@ xfs_rtfree_range( * old extent, add summary data for them to be allocated. */ if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(start - preblock), + xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; } @@ -739,9 +731,9 @@ xfs_rtfree_range( * old extent, add summary data for them to be allocated. */ if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock - end), + xfs_rtx_to_rbmblock(mp, end + 1), -1); if (error) { return error; } @@ -750,10 +742,9 @@ xfs_rtfree_range( * Increment the summary information corresponding to the entire * (new) free extent. */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); - return error; + return xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_rtx_to_rbmblock(mp, preblock), 1); } /* @@ -762,43 +753,39 @@ xfs_rtfree_range( */ int xfs_rtcheck_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int val, /* 1 for free, 0 for allocated */ - xfs_rtblock_t *new, /* out: first block not matching */ - int *stat) /* out: 1 for matches, 0 for not */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number of extent */ + xfs_rtxlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtxnum_t *new, /* out: first rtext not matching */ + int *stat) /* out: 1 for matches, 0 for not */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute starting bitmap block number */ - block = XFS_BITTOBLOCK(mp, start); + block = xfs_rtx_to_rbmblock(mp, start); /* * Read the bitmap block. */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Compute the starting word's address, and starting bit. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); /* * 0 (allocated) => all zero's; 1 (free) => all one's. @@ -820,11 +807,11 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ val) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ val) & mask)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); i = XFS_RTLOBIT(wdiff) - bit; *new = start + i; *stat = 0; @@ -835,22 +822,15 @@ xfs_rtcheck_range( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } else { /* @@ -866,11 +846,11 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ val)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ val)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); *new = start + i; *stat = 0; @@ -881,22 +861,15 @@ xfs_rtcheck_range( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } /* @@ -911,11 +884,11 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ val) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ val) & mask)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); *new = start + i; *stat = 0; @@ -926,7 +899,6 @@ xfs_rtcheck_range( /* * Successful, return. */ - xfs_trans_brelse(tp, bp); *new = start + i; *stat = 1; return 0; @@ -936,58 +908,57 @@ xfs_rtcheck_range( /* * Check that the given extent (block range) is allocated already. */ -STATIC int /* error */ +STATIC int xfs_rtcheck_alloc_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number of extent */ + xfs_rtxlen_t len) /* length of extent */ { - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - int stat; - int error; + xfs_rtxnum_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; - error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + error = xfs_rtcheck_range(args, start, len, 0, &new, &stat); if (error) return error; ASSERT(stat); return 0; } #else -#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#define xfs_rtcheck_alloc_range(a,b,l) (0) #endif /* * Free an extent in the realtime subvolume. Length is expressed in * realtime extents, as is the block number. */ -int /* error */ +int xfs_rtfree_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len) /* length of extent freed */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtxnum_t start, /* starting rtext number to free */ + xfs_rtxlen_t len) /* length of extent freed */ { - int error; /* error value */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_fsblock_t sb; /* summary file block number */ - struct xfs_buf *sumbp = NULL; /* summary file block buffer */ - struct timespec64 atime; - - mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; + int error; + struct timespec64 atime; ASSERT(mp->m_rbmip->i_itemp != NULL); ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + error = xfs_rtcheck_alloc_range(&args, start, len); if (error) return error; /* * Free the range of realtime blocks. */ - error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); - if (error) { - return error; - } + error = xfs_rtfree_range(&args, start, len); + if (error) + goto out; + /* * Mark more blocks free in the superblock. */ @@ -1002,11 +973,47 @@ xfs_rtfree_extent( mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; atime = inode_get_atime(VFS_I(mp->m_rbmip)); - *((uint64_t *)&atime) = 0; + atime.tv_sec = 0; inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } - return 0; + error = 0; +out: + xfs_rtbuf_cache_relse(&args); + return error; +} + +/* + * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of + * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen + * cannot exceed XFS_MAX_BMBT_EXTLEN. + */ +int +xfs_rtfree_blocks( + struct xfs_trans *tp, + xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rtxnum_t start; + xfs_filblks_t len; + xfs_extlen_t mod; + + ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); + + len = xfs_rtb_to_rtxrem(mp, rtlen, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + start = xfs_rtb_to_rtxrem(mp, rtbno, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + return xfs_rtfree_extent(tp, start, len); } /* Find all the free records within a given range. */ @@ -1019,10 +1026,14 @@ xfs_rtalloc_query_range( xfs_rtalloc_query_range_fn fn, void *priv) { + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; struct xfs_rtalloc_rec rec; - xfs_rtblock_t rtstart; - xfs_rtblock_t rtend; - xfs_rtblock_t high_key; + xfs_rtxnum_t rtstart; + xfs_rtxnum_t rtend; + xfs_rtxnum_t high_key; int is_free; int error = 0; @@ -1038,13 +1049,13 @@ xfs_rtalloc_query_range( rtstart = low_rec->ar_startext; while (rtstart <= high_key) { /* Is the first block free? */ - error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, + error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend, &is_free); if (error) break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend); + error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend); if (error) break; @@ -1060,6 +1071,7 @@ xfs_rtalloc_query_range( rtstart = rtend + 1; } + xfs_rtbuf_cache_relse(&args); return error; } @@ -1085,18 +1097,79 @@ int xfs_rtalloc_extent_is_free( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, - xfs_extlen_t len, + xfs_rtxnum_t start, + xfs_rtxlen_t len, bool *is_free) { - xfs_rtblock_t end; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; + xfs_rtxnum_t end; int matches; int error; - error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches); + error = xfs_rtcheck_range(&args, start, len, 1, &end, &matches); + xfs_rtbuf_cache_relse(&args); if (error) return error; *is_free = matches; return 0; } + +/* + * Compute the number of rtbitmap blocks needed to track the given number of rt + * extents. + */ +xfs_filblks_t +xfs_rtbitmap_blockcount( + struct xfs_mount *mp, + xfs_rtbxlen_t rtextents) +{ + return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); +} + +/* + * Compute the number of rtbitmap words needed to populate every block of a + * bitmap that is large enough to track the given number of rt extents. + */ +unsigned long long +xfs_rtbitmap_wordcount( + struct xfs_mount *mp, + xfs_rtbxlen_t rtextents) +{ + xfs_filblks_t blocks; + + blocks = xfs_rtbitmap_blockcount(mp, rtextents); + return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; +} + +/* Compute the number of rtsummary blocks needed to track the given rt space. */ +xfs_filblks_t +xfs_rtsummary_blockcount( + struct xfs_mount *mp, + unsigned int rsumlevels, + xfs_extlen_t rbmblocks) +{ + unsigned long long rsumwords; + + rsumwords = (unsigned long long)rsumlevels * rbmblocks; + return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); +} + +/* + * Compute the number of rtsummary info words needed to populate every block of + * a summary file that is large enough to track the given rt space. + */ +unsigned long long +xfs_rtsummary_wordcount( + struct xfs_mount *mp, + unsigned int rsumlevels, + xfs_extlen_t rbmblocks) +{ + xfs_filblks_t blocks; + + blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); + return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h new file mode 100644 index 000000000000..c0637057d69c --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_RTBITMAP_H__ +#define __XFS_RTBITMAP_H__ + +struct xfs_rtalloc_args { + struct xfs_mount *mp; + struct xfs_trans *tp; + + struct xfs_buf *rbmbp; /* bitmap block buffer */ + struct xfs_buf *sumbp; /* summary block buffer */ + + xfs_fileoff_t rbmoff; /* bitmap block number */ + xfs_fileoff_t sumoff; /* summary block number */ +}; + +static inline xfs_rtblock_t +xfs_rtx_to_rtb( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + if (mp->m_rtxblklog >= 0) + return rtx << mp->m_rtxblklog; + + return rtx * mp->m_sb.sb_rextsize; +} + +static inline xfs_extlen_t +xfs_rtxlen_to_extlen( + struct xfs_mount *mp, + xfs_rtxlen_t rtxlen) +{ + if (mp->m_rtxblklog >= 0) + return rtxlen << mp->m_rtxblklog; + + return rtxlen * mp->m_sb.sb_rextsize; +} + +/* Compute the misalignment between an extent length and a realtime extent .*/ +static inline unsigned int +xfs_extlen_to_rtxmod( + struct xfs_mount *mp, + xfs_extlen_t len) +{ + if (mp->m_rtxblklog >= 0) + return len & mp->m_rtxblkmask; + + return len % mp->m_sb.sb_rextsize; +} + +static inline xfs_rtxlen_t +xfs_extlen_to_rtxlen( + struct xfs_mount *mp, + xfs_extlen_t len) +{ + if (mp->m_rtxblklog >= 0) + return len >> mp->m_rtxblklog; + + return len / mp->m_sb.sb_rextsize; +} + +/* Convert an rt block number into an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rtbno >> mp->m_rtxblklog; + + return div_u64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Return the offset of an rt block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rtb_to_rtxoff( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rtbno & mp->m_rtxblkmask; + + return do_div(rtbno, mp->m_sb.sb_rextsize); +} + +/* + * Crack an rt block number into an rt extent number and an offset within that + * rt extent. Returns the rt extent number directly and the offset in @off. + */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtxrem( + struct xfs_mount *mp, + xfs_rtblock_t rtbno, + xfs_extlen_t *off) +{ + if (likely(mp->m_rtxblklog >= 0)) { + *off = rtbno & mp->m_rtxblkmask; + return rtbno >> mp->m_rtxblklog; + } + + return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off); +} + +/* + * Convert an rt block number into an rt extent number, rounding up to the next + * rt extent if the rt block is not aligned to an rt extent boundary. + */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtxup( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) { + if (rtbno & mp->m_rtxblkmask) + return (rtbno >> mp->m_rtxblklog) + 1; + return rtbno >> mp->m_rtxblklog; + } + + if (do_div(rtbno, mp->m_sb.sb_rextsize)) + rtbno++; + return rtbno; +} + +/* Round this rtblock up to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_rtb_roundup_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return roundup_64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Round this rtblock down to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_rtb_rounddown_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return rounddown_64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Convert an rt extent number to a file block offset in the rt bitmap file. */ +static inline xfs_fileoff_t +xfs_rtx_to_rbmblock( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return rtx >> mp->m_blkbit_log; +} + +/* Convert an rt extent number to a word offset within an rt bitmap block. */ +static inline unsigned int +xfs_rtx_to_rbmword( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); +} + +/* Convert a file block offset in the rt bitmap file to an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rbmblock_to_rtx( + struct xfs_mount *mp, + xfs_fileoff_t rbmoff) +{ + return rbmoff << mp->m_blkbit_log; +} + +/* Return a pointer to a bitmap word within a rt bitmap block. */ +static inline union xfs_rtword_raw * +xfs_rbmblock_wordptr( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_rtword_raw *words = args->rbmbp->b_addr; + + return words + index; +} + +/* Convert an ondisk bitmap word to its incore representation. */ +static inline xfs_rtword_t +xfs_rtbitmap_getword( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + + return word->old; +} + +/* Set an ondisk bitmap word from an incore representation. */ +static inline void +xfs_rtbitmap_setword( + struct xfs_rtalloc_args *args, + unsigned int index, + xfs_rtword_t value) +{ + union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + + word->old = value; +} + +/* + * Convert a rt extent length and rt bitmap block number to a xfs_suminfo_t + * offset within the rt summary file. + */ +static inline xfs_rtsumoff_t +xfs_rtsumoffs( + struct xfs_mount *mp, + int log2_len, + xfs_fileoff_t rbmoff) +{ + return log2_len * mp->m_sb.sb_rbmblocks + rbmoff; +} + +/* + * Convert an xfs_suminfo_t offset to a file block offset within the rt summary + * file. + */ +static inline xfs_fileoff_t +xfs_rtsumoffs_to_block( + struct xfs_mount *mp, + xfs_rtsumoff_t rsumoff) +{ + return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); +} + +/* + * Convert an xfs_suminfo_t offset to an info word offset within an rt summary + * block. + */ +static inline unsigned int +xfs_rtsumoffs_to_infoword( + struct xfs_mount *mp, + xfs_rtsumoff_t rsumoff) +{ + unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; + + return rsumoff & mask; +} + +/* Return a pointer to a summary info word within a rt summary block. */ +static inline union xfs_suminfo_raw * +xfs_rsumblock_infoptr( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_suminfo_raw *info = args->sumbp->b_addr; + + return info + index; +} + +/* Get the current value of a summary counter. */ +static inline xfs_suminfo_t +xfs_suminfo_get( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + + return info->old; +} + +/* Add to the current value of a summary counter and return the new value. */ +static inline xfs_suminfo_t +xfs_suminfo_add( + struct xfs_rtalloc_args *args, + unsigned int index, + int delta) +{ + union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + + info->old += delta; + return info->old; +} + +/* + * Functions for walking free space rtextents in the realtime bitmap. + */ +struct xfs_rtalloc_rec { + xfs_rtxnum_t ar_startext; + xfs_rtbxlen_t ar_extcount; +}; + +typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv); + +#ifdef CONFIG_XFS_RT +void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); + +int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block, + int issum); + +static inline int +xfs_rtbitmap_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + return xfs_rtbuf_get(args, block, 0); +} + +static inline int +xfs_rtsummary_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + return xfs_rtbuf_get(args, block, 1); +} + +int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); +int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); +int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); +int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len, int val); +int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum); +int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, int delta); +int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len); +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, void *priv); +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv); +int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxlen_t len, + bool *is_free); +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtxnum_t start, /* starting rtext number to free */ + xfs_rtxlen_t len); /* length of extent freed */ + +/* Same as above, but in units of rt blocks. */ +int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen); + +xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t + rtextents); +unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, + xfs_rtbxlen_t rtextents); + +xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, + unsigned int rsumlevels, xfs_extlen_t rbmblocks); +unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, + unsigned int rsumlevels, xfs_extlen_t rbmblocks); +#else /* CONFIG_XFS_RT */ +# define xfs_rtfree_extent(t,b,l) (-ENOSYS) +# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) +# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) +# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) +# define xfs_rtsummary_read_buf(a,b) (-ENOSYS) +# define xfs_rtbuf_cache_relse(a) (0) +# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +static inline xfs_filblks_t +xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) +{ + /* shut up gcc */ + return 0; +} +# define xfs_rtbitmap_wordcount(mp, r) (0) +# define xfs_rtsummary_blockcount(mp, l, b) (0) +# define xfs_rtsummary_wordcount(mp, l, b) (0) +#endif /* CONFIG_XFS_RT */ + +#endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 6264daaab37b..1f74d0cd1618 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -975,6 +975,8 @@ xfs_sb_mount_common( mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index a5e14740ec9a..19134b23c10b 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); -#define XFS_FS_GEOM_MAX_STRUCT_VER (4) +#define XFS_FS_GEOM_MAX_STRUCT_VER (5) extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5b2f27cbdb80..6cd45e8c118d 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -19,6 +19,7 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_trans_space.h" +#include "xfs_rtbitmap.h" #define _ALLOC true #define _FREE false @@ -217,11 +218,12 @@ xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); - unsigned int rtbmp_bytes; + unsigned int rtbmp_blocks; + xfs_rtxlen_t rtxlen; - rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; - return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; + rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); + rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen); + return (rtbmp_blocks + 1) * num_ops; } /* diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 5c2765934732..c299b16c9365 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -148,10 +148,10 @@ xfs_verify_rtbno( /* Verify that a realtime device extent is fully contained inside the volume. */ bool -xfs_verify_rtext( +xfs_verify_rtbext( struct xfs_mount *mp, xfs_rtblock_t rtbno, - xfs_rtblock_t len) + xfs_filblks_t len) { if (rtbno + len <= rtbno) return false; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 851220021484..533200c4ccc2 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -11,6 +11,7 @@ typedef uint32_t prid_t; /* project ID */ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ typedef uint64_t xfs_extnum_t; /* # of extents in a file */ typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ @@ -18,6 +19,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef uint32_t xfs_rtsumoff_t; /* offset of an rtsummary info word */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ @@ -31,6 +33,8 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ typedef uint64_t xfs_fileoff_t; /* block number in a file */ typedef uint64_t xfs_filblks_t; /* number of blocks in a file */ +typedef uint64_t xfs_rtxnum_t; /* rtextent number */ +typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ @@ -47,6 +51,7 @@ typedef void * xfs_failaddr_t; #define NULLRFSBLOCK ((xfs_rfsblock_t)-1) #define NULLRTBLOCK ((xfs_rtblock_t)-1) #define NULLFILEOFF ((xfs_fileoff_t)-1) +#define NULLRTEXTNO ((xfs_rtxnum_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) @@ -145,6 +150,7 @@ typedef uint32_t xfs_dqid_t; */ #define XFS_NBBYLOG 3 /* log2(NBBY) */ #define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_SUMINFOLOG 2 /* log2(sizeof(xfs_suminfo_t)) */ #define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) #define XFS_NBWORD (1 << XFS_NBWORDLOG) #define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) @@ -229,8 +235,8 @@ bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); -bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno, - xfs_rtblock_t len); +bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno, + xfs_filblks_t len); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min, diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 75588915572e..06d8c1996a33 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -410,7 +410,7 @@ xchk_bmap_iextent( /* Make sure the extent points to a valid place. */ if (info->is_rt && - !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) + !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (!info->is_rt && diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 05be757668bb..5799e9a94f1f 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -16,7 +16,7 @@ #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "scrub/scrub.h" diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 59d7912fb75f..889f556bc98f 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -20,6 +20,7 @@ #include "xfs_reflink.h" #include "xfs_rmap.h" #include "xfs_bmap_util.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -225,7 +226,7 @@ xchk_inode_extsize( */ if ((flags & XFS_DIFLAG_RTINHERIT) && (flags & XFS_DIFLAG_EXTSZINHERIT) && - value % sc->mp->m_sb.sb_rextsize > 0) + xfs_extlen_to_rtxmod(sc->mp, value) > 0) xchk_ino_set_warning(sc, ino); } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 008ddb599e13..41a1d89ae8e6 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -11,7 +11,7 @@ #include "xfs_mount.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "scrub/scrub.h" @@ -48,12 +48,12 @@ xchk_rtbitmap_rec( { struct xfs_scrub *sc = priv; xfs_rtblock_t startblock; - xfs_rtblock_t blockcount; + xfs_filblks_t blockcount; - startblock = rec->ar_startext * mp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; + startblock = xfs_rtx_to_rtb(mp, rec->ar_startext); + blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount); - if (!xfs_verify_rtext(mp, startblock, blockcount)) + if (!xfs_verify_rtbext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -128,26 +128,22 @@ out: void xchk_xref_is_used_rt_space( struct xfs_scrub *sc, - xfs_rtblock_t fsbno, + xfs_rtblock_t rtbno, xfs_extlen_t len) { - xfs_rtblock_t startext; - xfs_rtblock_t endext; - xfs_rtblock_t extcount; + xfs_rtxnum_t startext; + xfs_rtxnum_t endext; bool is_free; int error; if (xchk_skip_xref(sc->sm)) return; - startext = fsbno; - endext = fsbno + len - 1; - do_div(startext, sc->mp->m_sb.sb_rextsize); - do_div(endext, sc->mp->m_sb.sb_rextsize); - extcount = endext - startext + 1; + startext = xfs_rtb_to_rtx(sc->mp, rtbno); + endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, - &is_free); + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, + endext - startext + 1, &is_free); if (!xchk_should_check_xref(sc, &error, NULL)) goto out_unlock; if (is_free) diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index 437ed9acbb27..8b15c47408d0 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -13,7 +13,7 @@ #include "xfs_inode.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_bit.h" #include "xfs_bmap.h" #include "scrub/scrub.h" @@ -81,34 +81,45 @@ typedef unsigned int xchk_rtsumoff_t; static inline int xfsum_load( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - xfs_suminfo_t *info) + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo) { - return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t), + return xfile_obj_load(sc->xfile, rawinfo, + sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } static inline int xfsum_store( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - const xfs_suminfo_t info) + xfs_rtsumoff_t sumoff, + const union xfs_suminfo_raw rawinfo) { - return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t), + return xfile_obj_store(sc->xfile, &rawinfo, + sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } static inline int xfsum_copyout( struct xfs_scrub *sc, - xchk_rtsumoff_t sumoff, - xfs_suminfo_t *info, + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words) { - return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG, + return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, sumoff << XFS_WORDLOG); } +static inline xfs_suminfo_t +xchk_rtsum_inc( + struct xfs_mount *mp, + union xfs_suminfo_raw *v) +{ + v->old += 1; + return v->old; +} + /* Update the summary file to reflect the free extent that we've accumulated. */ STATIC int xchk_rtsum_record_free( @@ -121,23 +132,24 @@ xchk_rtsum_record_free( xfs_fileoff_t rbmoff; xfs_rtblock_t rtbno; xfs_filblks_t rtlen; - xchk_rtsumoff_t offs; + xfs_rtsumoff_t offs; unsigned int lenlog; - xfs_suminfo_t v = 0; + union xfs_suminfo_raw v; + xfs_suminfo_t value; int error = 0; if (xchk_should_terminate(sc, &error)) return error; /* Compute the relevant location in the rtsum file. */ - rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext); + rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); - offs = XFS_SUMOFFS(mp, lenlog, rbmoff); + offs = xfs_rtsumoffs(mp, lenlog, rbmoff); - rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; - rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); + rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); - if (!xfs_verify_rtext(mp, rtbno, rtlen)) { + if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); return -EFSCORRUPTED; } @@ -147,9 +159,9 @@ xchk_rtsum_record_free( if (error) return error; - v++; + value = xchk_rtsum_inc(sc->mp, &v); trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount, - lenlog, offs, v); + lenlog, offs, value); return xfsum_store(sc, offs, v); } @@ -160,12 +172,11 @@ xchk_rtsum_compute( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - unsigned long long rtbmp_bytes; + unsigned long long rtbmp_blocks; /* If the bitmap size doesn't match the computed size, bail. */ - rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY); - if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) != - mp->m_rbmip->i_disk_size) + rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents); + if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size) return -EFSCORRUPTED; return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, @@ -177,14 +188,18 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { + struct xfs_rtalloc_args args = { + .mp = sc->mp, + .tp = sc->tp, + }; struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp; struct xfs_bmbt_irec map; xfs_fileoff_t off; xchk_rtsumoff_t sumoff = 0; int nmap; for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { + union xfs_suminfo_raw *ondisk_info; int error = 0; if (xchk_should_terminate(sc, &error)) @@ -205,22 +220,23 @@ xchk_rtsum_compare( } /* Read a block's worth of ondisk rtsummary file. */ - error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp); + error = xfs_rtsummary_read_buf(&args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); if (error) { - xfs_trans_brelse(sc->tp, bp); + xfs_rtbuf_cache_relse(&args); return error; } - if (memcmp(bp->b_addr, sc->buf, + ondisk_info = xfs_rsumblock_infoptr(&args, 0); + if (memcmp(ondisk_info, sc->buf, mp->m_blockwsize << XFS_WORDLOG) != 0) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); - xfs_trans_brelse(sc->tp, bp); + xfs_rtbuf_cache_relse(&args); sumoff += mp->m_blockwsize; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 46249e7b17e0..29afa4851235 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -13,6 +13,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index cbd4d01e253c..4a8bc6f3c8f2 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1036,17 +1036,18 @@ TRACE_EVENT(xfarray_sort_stats, #ifdef CONFIG_XFS_RT TRACE_EVENT(xchk_rtsum_record_free, - TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start, - uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v), - TP_ARGS(mp, start, len, log, pos, v), + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, + xfs_rtbxlen_t len, unsigned int log, loff_t pos, + xfs_suminfo_t value), + TP_ARGS(mp, start, len, log, pos, value), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, rtdev) - __field(xfs_rtblock_t, start) + __field(xfs_rtxnum_t, start) __field(unsigned long long, len) __field(unsigned int, log) __field(loff_t, pos) - __field(xfs_suminfo_t, v) + __field(xfs_suminfo_t, value) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; @@ -1055,7 +1056,7 @@ TRACE_EVENT(xchk_rtsum_record_free, __entry->len = len; __entry->log = log; __entry->pos = pos; - __entry->v = v; + __entry->value = value; ), TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1064,7 +1065,7 @@ TRACE_EVENT(xchk_rtsum_record_free, __entry->len, __entry->log, __entry->pos, - __entry->v) + __entry->value) ); #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 40e0a1f1f753..731260a5af6d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -28,6 +28,7 @@ #include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_reflink.h" +#include "xfs_rtbitmap.h" /* Kernel only BMAP related definitions and functions */ @@ -75,28 +76,28 @@ xfs_bmap_rtalloc( { struct xfs_mount *mp = ap->ip->i_mount; xfs_fileoff_t orig_offset = ap->offset; - xfs_rtblock_t rtb; - xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_rtxnum_t rtx; + xfs_rtxlen_t prod = 0; /* product factor for allocators */ xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ xfs_extlen_t align; /* minimum allocation alignment */ xfs_extlen_t orig_length = ap->length; xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_extlen_t raminlen; + xfs_rtxlen_t raminlen; bool rtlocked = false; bool ignore_locality = false; int error; align = xfs_get_extsz_hint(ap->ip); retry: - prod = align / mp->m_sb.sb_rextsize; + prod = xfs_extlen_to_rtxlen(mp, align); error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, ap->conv, &ap->offset, &ap->length); if (error) return error; ASSERT(ap->length); - ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); + ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); /* * If we shifted the file offset downward to satisfy an extent size @@ -116,17 +117,14 @@ retry: prod = 1; /* * Set ralen to be the actual requested length in rtextents. - */ - ralen = ap->length / mp->m_sb.sb_rextsize; - /* + * * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that * we rounded up to it, cut it back so it's valid again. * Note that if it's a really large request (bigger than * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't * adjust the starting point to match it. */ - if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN) - ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize; + ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); /* * Lock out modifications to both the RT bitmap and summary inodes @@ -144,12 +142,10 @@ retry: * pick an extent that will space things out in the rt area. */ if (ap->eof && ap->offset == 0) { - xfs_rtblock_t rtx; /* realtime extent no */ - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; - ap->blkno = rtx * mp->m_sb.sb_rextsize; + ap->blkno = xfs_rtx_to_rtb(mp, rtx); } else { ap->blkno = 0; } @@ -160,20 +156,18 @@ retry: * Realtime allocation, done through xfs_rtallocate_extent. */ if (ignore_locality) - ap->blkno = 0; + rtx = 0; else - do_div(ap->blkno, mp->m_sb.sb_rextsize); - rtb = ap->blkno; - ap->length = ralen; - raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize); - error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length, - &ralen, ap->wasdel, prod, &rtb); + rtx = xfs_rtb_to_rtx(mp, ap->blkno); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, + ap->wasdel, prod, &rtx); if (error) return error; - if (rtb != NULLRTBLOCK) { - ap->blkno = rtb * mp->m_sb.sb_rextsize; - ap->length = ralen * mp->m_sb.sb_rextsize; + if (rtx != NULLRTEXTNO) { + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -690,7 +684,7 @@ xfs_can_free_eofblocks( */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) - end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); + end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; @@ -780,12 +774,10 @@ xfs_alloc_file_space( { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; - xfs_filblks_t allocated_fsb; xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; - int nimaps; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; @@ -808,7 +800,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; - nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); allocatesize_fsb = endoffset_fsb - startoffset_fsb; @@ -819,6 +810,7 @@ xfs_alloc_file_space( while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; unsigned int dblocks, rblocks, resblks; + int nimaps = 1; /* * Determine space reservations for data/realtime. @@ -884,15 +876,19 @@ xfs_alloc_file_space( if (error) break; - allocated_fsb = imapp->br_blockcount; - - if (nimaps == 0) { - error = -ENOSPC; - break; + /* + * If the allocator cannot find a single free extent large + * enough to cover the start block of the requested range, + * xfs_bmapi_write will return 0 but leave *nimaps set to 0. + * + * In that case we simply need to keep looping with the same + * startoffset_fsb so that one of the following allocations + * will eventually reach the requested range. + */ + if (nimaps) { + startoffset_fsb += imapp->br_blockcount; + allocatesize_fsb -= imapp->br_blockcount; } - - startoffset_fsb += allocated_fsb; - allocatesize_fsb -= allocated_fsb; } return error; @@ -989,10 +985,8 @@ xfs_free_file_space( /* We can only free complete realtime extents. */ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { - startoffset_fsb = roundup_64(startoffset_fsb, - mp->m_sb.sb_rextsize); - endoffset_fsb = rounddown_64(endoffset_fsb, - mp->m_sb.sb_rextsize); + startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb); + endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 203700278ddb..e33e5e13b95f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -214,6 +214,43 @@ xfs_ilock_iocb( return 0; } +static int +xfs_ilock_iocb_for_write( + struct kiocb *iocb, + unsigned int *lock_mode) +{ + ssize_t ret; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + + ret = xfs_ilock_iocb(iocb, *lock_mode); + if (ret) + return ret; + + if (*lock_mode == XFS_IOLOCK_EXCL) + return 0; + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return 0; + + xfs_iunlock(ip, *lock_mode); + *lock_mode = XFS_IOLOCK_EXCL; + return xfs_ilock_iocb(iocb, *lock_mode); +} + +static unsigned int +xfs_ilock_for_write_fault( + struct xfs_inode *ip) +{ + /* get a shared lock if no remapping in progress */ + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return XFS_MMAPLOCK_SHARED; + + /* wait for remapping to complete */ + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + return XFS_MMAPLOCK_EXCL; +} + STATIC ssize_t xfs_file_dio_read( struct kiocb *iocb, @@ -551,7 +588,7 @@ xfs_file_dio_write_aligned( unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; ret = xfs_file_write_checks(iocb, from, &iolock); @@ -618,7 +655,7 @@ retry_exclusive: flags = IOMAP_DIO_FORCE_WAIT; } - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -1180,7 +1217,7 @@ xfs_file_remap_range( if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: - xfs_iunlock2_io_mmap(src, dest); + xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? remapped : ret; @@ -1328,6 +1365,7 @@ __xfs_filemap_fault( struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; + unsigned int lock_mode = 0; trace_xfs_filemap_fault(ip, order, write_fault); @@ -1336,25 +1374,24 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } + if (IS_DAX(inode) || write_fault) + lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); + if (IS_DAX(inode)) { pfn_t pfn; - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault(vmf, order, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, order, pfn); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else if (write_fault) { + ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); } else { - if (write_fault) { - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = iomap_page_mkwrite(vmf, - &xfs_page_mkwrite_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - } else { - ret = filemap_fault(vmf); - } + ret = filemap_fault(vmf); } + if (lock_mode) + xfs_iunlock(XFS_I(inode), lock_mode); + if (write_fault) sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 736e5545f584..5a72217f5feb 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -23,7 +23,7 @@ #include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_alloc_btree.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_ag.h" /* Convert an xfs_fsmap to an fsmap. */ @@ -483,11 +483,11 @@ xfs_getfsmap_rtdev_rtbitmap_helper( xfs_rtblock_t rtbno; xfs_daddr_t rec_daddr, len_daddr; - rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); rec_daddr = XFS_FSB_TO_BB(mp, rtbno); irec.rm_startblock = rtbno; - rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount); len_daddr = XFS_FSB_TO_BB(mp, rtbno); irec.rm_blockcount = rtbno; @@ -514,7 +514,7 @@ xfs_getfsmap_rtdev_rtbitmap( uint64_t eofs; int error; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize); + eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents)); if (keys[0].fmr_physical >= eofs) return 0; start_rtb = XFS_BB_TO_FSBT(mp, @@ -539,11 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap( * Set up query parameters to return free rtextents covering the range * we want. */ - alow.ar_startext = start_rtb; - ahigh.ar_startext = end_rtb; - do_div(alow.ar_startext, mp->m_sb.sb_rextsize); - if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) - ahigh.ar_startext++; + alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb); + ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb); error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 36f5cf802c07..c0f1c89786c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -918,6 +918,13 @@ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) { + if (VFS_I(ip)->i_nlink == 0) { + xfs_alert(ip->i_mount, + "%s: Attempt to drop inode (%llu) with nlink zero.", + __func__, ip->i_ino); + return -EFSCORRUPTED; + } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); drop_nlink(VFS_I(ip)); @@ -3621,6 +3628,23 @@ xfs_iunlock2_io_mmap( inode_unlock(VFS_I(ip1)); } +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */ +void +xfs_iunlock2_remapping( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + xfs_iflags_clear(ip1, XFS_IREMAPPING); + + if (ip1 != ip2) + xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + + if (ip1 != ip2) + inode_unlock_shared(VFS_I(ip1)); + inode_unlock(VFS_I(ip2)); +} + /* * Reload the incore inode list for this inode. Caller should ensure that * the link count cannot change, either by taking ILOCK_SHARED or otherwise diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0c5bdb91152e..3dc47937da5d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) /* Quotacheck is running but inode has not been added to quota counts. */ #define XFS_IQUOTAUNCHECKED (1 << 14) +/* + * Remap in progress. Callers that wish to update file data while + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake + * the lock in exclusive mode. Relocking the file will block until + * IREMAPPING is cleared. + */ +#define XFS_IREMAPPING (1U << 15) + /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); static inline bool xfs_inode_unlinked_incomplete( diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 17c51804f9c6..cd7803fda8b1 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_error.h" +#include "xfs_rtbitmap.h" #include <linux/iversion.h> @@ -107,7 +108,7 @@ xfs_inode_item_precommit( */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT); ip->i_extsize = 0; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 55bb01173cde..a82470e027f7 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -38,6 +38,7 @@ #include "xfs_reflink.h" #include "xfs_ioctl.h" #include "xfs_xattr.h" +#include "xfs_rtbitmap.h" #include <linux/mount.h> #include <linux/namei.h> @@ -1004,7 +1005,7 @@ xfs_fill_fsxattr( * later. */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - ip->i_extsize % mp->m_sb.sb_rextsize > 0) { + xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) { fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); fa->fsx_extsize = 0; @@ -1130,7 +1131,7 @@ xfs_ioctl_setattr_xflags( /* If realtime flag is set then must have realtime device */ if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || - (ip->i_extsize % mp->m_sb.sb_rextsize)) + xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index e9d317a3dafe..d7873e0360f0 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -198,6 +198,18 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) return x; } +/* If @b is a power of 2, return log2(b). Else return -1. */ +static inline int8_t log2_if_power2(unsigned long b) +{ + return is_power_of_2(b) ? ilog2(b) : -1; +} + +/* If @b is a power of 2, return a mask of the lower bits, else return zero. */ +static inline unsigned long long mask64_if_power2(unsigned long b) +{ + return is_power_of_2(b) ? b - 1 : 0; +} + int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, enum req_op op); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 219681d29fbc..503fe3c7edbf 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -101,9 +101,9 @@ typedef struct xfs_mount { /* * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] <= the minimum i for which - * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip - * inode lock. + * invariant that m_rsum_cache[bbno] > the maximum i for which + * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i. + * Reads and writes are serialized by the rsumip inode lock. */ uint8_t *m_rsum_cache; struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ @@ -119,6 +119,7 @@ typedef struct xfs_mount { uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + int8_t m_rtxblklog; /* log2 of rextsize, if possible */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -152,6 +153,7 @@ typedef struct xfs_mount { uint64_t m_features; /* active filesystem features */ uint64_t m_low_space[XFS_LOWSP_MAX]; uint64_t m_low_rtexts[XFS_LOWSP_MAX]; + uint64_t m_rtxblkmask; /* rt extent block mask */ struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index c4cc99b70dd3..21a7e350b4c5 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -72,6 +72,10 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + /* realtime structures */ + XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); + XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); + /* * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to * 4 bytes anyway so it's not obviously a problem. Hence for the moment diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index eb9102453aff..658edee8381d 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1540,6 +1540,10 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; + xfs_iflags_set(src, XFS_IREMAPPING); + if (inode_in != inode_out) + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + return 0; out_unlock: xfs_iunlock2_io_mmap(src, dest); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 2e1a4e5cd03d..88c48de5c9c8 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -19,6 +19,7 @@ #include "xfs_icache.h" #include "xfs_rtalloc.h" #include "xfs_sb.h" +#include "xfs_rtbitmap.h" /* * Read and return the summary information for a given extent size, @@ -28,48 +29,48 @@ */ static int xfs_rtget_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum); + return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); } /* * Return whether there are any free extents in the size range given * by low and high, for the bitmap block bbno. */ -STATIC int /* error */ +STATIC int xfs_rtany_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int low, /* low log2 extent size */ - int high, /* high log2 extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - int *stat) /* out: any good extents here? */ + struct xfs_rtalloc_args *args, + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int *maxlog) /* out: max log2 extent size free */ { - int error; /* error value */ - int log; /* loop counter, log2 of ext. size */ - xfs_suminfo_t sum; /* summary data */ - - /* There are no extents at levels < m_rsum_cache[bbno]. */ - if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno]) - low = mp->m_rsum_cache[bbno]; + struct xfs_mount *mp = args->mp; + int error; + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* There are no extents at levels >= m_rsum_cache[bbno]. */ + if (mp->m_rsum_cache) { + high = min(high, mp->m_rsum_cache[bbno] - 1); + if (low > high) { + *maxlog = -1; + return 0; + } + } /* * Loop over logs of extent sizes. */ - for (log = low; log <= high; log++) { + for (log = high; log >= low; log--) { /* * Get one summary datum. */ - error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + error = xfs_rtget_summary(args, log, bbno, &sum); if (error) { return error; } @@ -77,18 +78,18 @@ xfs_rtany_summary( * If there are any, return success. */ if (sum) { - *stat = 1; + *maxlog = log; goto out; } } /* * Found nothing, return failure. */ - *stat = 0; + *maxlog = -1; out: - /* There were no extents at levels < log. */ - if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; + /* There were no extents at levels > log. */ + if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; return 0; } @@ -97,60 +98,54 @@ out: * Copy and transform the summary file, given the old and new * parameters in the mount structures. */ -STATIC int /* error */ +STATIC int xfs_rtcopy_summary( - xfs_mount_t *omp, /* old file system mount point */ - xfs_mount_t *nmp, /* new file system mount point */ - xfs_trans_t *tp) /* transaction pointer */ + struct xfs_rtalloc_args *oargs, + struct xfs_rtalloc_args *nargs) { - xfs_rtblock_t bbno; /* bitmap block number */ - struct xfs_buf *bp; /* summary buffer */ - int error; /* error return value */ - int log; /* summary level number (log length) */ - xfs_suminfo_t sum; /* summary data */ - xfs_fsblock_t sumbno; /* summary block number */ + xfs_fileoff_t bbno; /* bitmap block number */ + int error; + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ - bp = NULL; - for (log = omp->m_rsumlevels - 1; log >= 0; log--) { - for (bbno = omp->m_sb.sb_rbmblocks - 1; + for (log = oargs->mp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = oargs->mp->m_sb.sb_rbmblocks - 1; (xfs_srtblock_t)bbno >= 0; bbno--) { - error = xfs_rtget_summary(omp, tp, log, bbno, &bp, - &sumbno, &sum); + error = xfs_rtget_summary(oargs, log, bbno, &sum); if (error) - return error; + goto out; if (sum == 0) continue; - error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, - &bp, &sumbno); + error = xfs_rtmodify_summary(oargs, log, bbno, -sum); if (error) - return error; - error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, - &bp, &sumbno); + goto out; + error = xfs_rtmodify_summary(nargs, log, bbno, sum); if (error) - return error; + goto out; ASSERT(sum > 0); } } + error = 0; +out: + xfs_rtbuf_cache_relse(oargs); return 0; } /* * Mark an extent specified by start and len allocated. * Updates all the summary information as well as the bitmap. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* start block to allocate */ - xfs_extlen_t len, /* length to allocate */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* start rtext to allocate */ + xfs_rtxlen_t len) /* in/out: summary block number */ { - xfs_rtblock_t end; /* end of the allocated extent */ - int error; /* error value */ - xfs_rtblock_t postblock = 0; /* first block allocated > end */ - xfs_rtblock_t preblock = 0; /* first block allocated < start */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t end; /* end of the allocated rtext */ + int error; + xfs_rtxnum_t postblock = 0; /* first rtext allocated > end */ + xfs_rtxnum_t preblock = 0; /* first rtext allocated < start */ end = start + len - 1; /* @@ -158,15 +153,15 @@ xfs_rtallocate_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + error = xfs_rtfind_back(args, start, 0, &preblock); if (error) { return error; } /* * Find the next allocated block (end of free extent). */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); + error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + &postblock); if (error) { return error; } @@ -174,9 +169,9 @@ xfs_rtallocate_range( * Decrement the summary information corresponding to the entire * (old) free extent. */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; } @@ -185,9 +180,9 @@ xfs_rtallocate_range( * old extent, add summary data for them to be free. */ if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(start - preblock), + xfs_rtx_to_rbmblock(mp, preblock), 1); if (error) { return error; } @@ -197,9 +192,9 @@ xfs_rtallocate_range( * old extent, add summary data for them to be free. */ if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock - end), + xfs_rtx_to_rbmblock(mp, end + 1), 1); if (error) { return error; } @@ -207,54 +202,69 @@ xfs_rtallocate_range( /* * Modify the bitmap to mark this extent allocated. */ - error = xfs_rtmodify_range(mp, tp, start, len, 0); + error = xfs_rtmodify_range(args, start, len, 0); return error; } /* + * Make sure we don't run off the end of the rt volume. Be careful that + * adjusting maxlen downwards doesn't cause us to fail the alignment checks. + */ +static inline xfs_rtxlen_t +xfs_rtallocate_clamp_len( + struct xfs_mount *mp, + xfs_rtxnum_t startrtx, + xfs_rtxlen_t rtxlen, + xfs_rtxlen_t prod) +{ + xfs_rtxlen_t ret; + + ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx; + return rounddown(ret, prod); +} + +/* * Attempt to allocate an extent minlen<=len<=maxlen starting from * bitmap block bbno. If we don't get maxlen then use prod to trim - * the length, if given. Returns error; returns starting block in *rtblock. + * the length, if given. Returns error; returns starting block in *rtx. * The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_block( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - xfs_rtblock_t *nextp, /* out: next block to try */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_fileoff_t bbno, /* bitmap block number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *nextp, /* out: next rtext to try */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - xfs_rtblock_t besti; /* best rtblock found so far */ - xfs_rtblock_t bestlen; /* best length found so far */ - xfs_rtblock_t end; /* last rtblock in chunk */ - int error; /* error value */ - xfs_rtblock_t i; /* current rtblock trying */ - xfs_rtblock_t next; /* next rtblock to try */ - int stat; /* status from internal calls */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t besti; /* best rtext found so far */ + xfs_rtxnum_t bestlen;/* best length found so far */ + xfs_rtxnum_t end; /* last rtext in chunk */ + int error; + xfs_rtxnum_t i; /* current rtext trying */ + xfs_rtxnum_t next; /* next rtext to try */ + int stat; /* status from internal calls */ /* * Loop over all the extents starting in this bitmap block, * looking for one that's long enough. */ - for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0, - end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; + for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0, + end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1; i <= end; i++) { /* Make sure we don't scan off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); /* * See if there's a free extent of maxlen starting at i. * If it's not so then next will contain the first non-free. */ - error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat); + error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); if (error) { return error; } @@ -262,13 +272,12 @@ xfs_rtallocate_extent_block( /* * i for maxlen is all free, allocate and return that. */ - error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp, - rsb); + error = xfs_rtallocate_range(args, i, maxlen); if (error) { return error; } *len = maxlen; - *rtblock = i; + *rtx = i; return 0; } /* @@ -278,7 +287,7 @@ xfs_rtallocate_extent_block( * so far, remember it. */ if (minlen < maxlen) { - xfs_rtblock_t thislen; /* this extent size */ + xfs_rtxnum_t thislen; /* this extent size */ thislen = next - i; if (thislen >= minlen && thislen > bestlen) { @@ -290,7 +299,7 @@ xfs_rtallocate_extent_block( * If not done yet, find the start of the next free space. */ if (next < end) { - error = xfs_rtfind_forw(mp, tp, next, end, &i); + error = xfs_rtfind_forw(args, next, end, &i); if (error) { return error; } @@ -301,7 +310,7 @@ xfs_rtallocate_extent_block( * Searched the whole thing & didn't find a maxlen free extent. */ if (minlen < maxlen && besti != -1) { - xfs_extlen_t p; /* amount to trim length by */ + xfs_rtxlen_t p; /* amount to trim length by */ /* * If size should be a multiple of prod, make that so. @@ -315,51 +324,49 @@ xfs_rtallocate_extent_block( /* * Allocate besti for bestlen & return that. */ - error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb); + error = xfs_rtallocate_range(args, besti, bestlen); if (error) { return error; } *len = bestlen; - *rtblock = besti; + *rtx = besti; return 0; } /* * Allocation failed. Set *nextp to the next block to try. */ *nextp = next; - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } /* * Allocate an extent of length minlen<=len<=maxlen, starting at block * bno. If we don't get maxlen then use prod to trim the length, if given. - * Returns error; returns starting block in *rtblock. + * Returns error; returns starting block in *rtx. * The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_exact( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int error; /* error value */ - xfs_extlen_t i; /* extent length trimmed due to prod */ - int isfree; /* extent is free */ - xfs_rtblock_t next; /* next block to try (dummy) */ + int error; + xfs_rtxlen_t i; /* extent length trimmed due to prod */ + int isfree; /* extent is free */ + xfs_rtxnum_t next; /* next rtext to try (dummy) */ - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); /* * Check if the range in question (for maxlen) is free. */ - error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree); + error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); if (error) { return error; } @@ -367,23 +374,23 @@ xfs_rtallocate_extent_exact( /* * If it is, allocate it and return success. */ - error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + error = xfs_rtallocate_range(args, start, maxlen); if (error) { return error; } *len = maxlen; - *rtblock = bno; + *rtx = start; return 0; } /* * If not, allocate what there is, if it's at least minlen. */ - maxlen = next - bno; + maxlen = next - start; if (maxlen < minlen) { /* * Failed, return failure status. */ - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } /* @@ -395,81 +402,82 @@ xfs_rtallocate_extent_exact( /* * Now we can't do it, return failure status. */ - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } } /* * Allocate what we can and return it. */ - error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + error = xfs_rtallocate_range(args, start, maxlen); if (error) { return error; } *len = maxlen; - *rtblock = bno; + *rtx = start; return 0; } /* * Allocate an extent of length minlen<=len<=maxlen, starting as near - * to bno as possible. If we don't get maxlen then use prod to trim + * to start as possible. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_near( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int any; /* any useful extents from summary */ - xfs_rtblock_t bbno; /* bitmap block number */ - int error; /* error value */ - int i; /* bitmap block offset (loop control) */ - int j; /* secondary loop control */ - int log2len; /* log2 of minlen */ - xfs_rtblock_t n; /* next block to try */ - xfs_rtblock_t r; /* result block */ - - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + struct xfs_mount *mp = args->mp; + int maxlog; /* max useful extent from summary */ + xfs_fileoff_t bbno; /* bitmap block number */ + int error; + int i; /* bitmap block offset (loop control) */ + int j; /* secondary loop control */ + int log2len; /* log2 of minlen */ + xfs_rtxnum_t n; /* next rtext to try */ + xfs_rtxnum_t r; /* result rtext */ + + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); + /* * If the block number given is off the end, silently set it to * the last block. */ - if (bno >= mp->m_sb.sb_rextents) - bno = mp->m_sb.sb_rextents - 1; + if (start >= mp->m_sb.sb_rextents) + start = mp->m_sb.sb_rextents - 1; /* Make sure we don't run off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; + maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); if (maxlen < minlen) { - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } /* * Try the exact allocation first. */ - error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len, - rbpp, rsb, prod, &r); + error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len, + prod, &r); if (error) { return error; } /* * If the exact allocation worked, return that. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } - bbno = XFS_BITTOBLOCK(mp, bno); + bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; + j = -1; ASSERT(minlen != 0); log2len = xfs_highbit32(minlen); /* @@ -480,8 +488,8 @@ xfs_rtallocate_extent_near( * Get summary information of extents of all useful levels * starting in this bitmap block. */ - error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1, - bbno + i, rbpp, rsb, &any); + error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1, + bbno + i, &maxlog); if (error) { return error; } @@ -489,7 +497,10 @@ xfs_rtallocate_extent_near( * If there are any useful extents starting here, try * allocating one. */ - if (any) { + if (maxlog >= 0) { + xfs_extlen_t maxavail = + min_t(xfs_rtblock_t, maxlen, + (1ULL << (maxlog + 1)) - 1); /* * On the positive side of the starting location. */ @@ -498,17 +509,17 @@ xfs_rtallocate_extent_near( * Try to allocate an extent starting in * this block. */ - error = xfs_rtallocate_extent_block(mp, tp, - bbno + i, minlen, maxlen, len, &n, rbpp, - rsb, prod, &r); + error = xfs_rtallocate_extent_block(args, + bbno + i, minlen, maxavail, len, + &n, prod, &r); if (error) { return error; } /* * If it worked, return it. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } } @@ -516,68 +527,46 @@ xfs_rtallocate_extent_near( * On the negative side of the starting location. */ else { /* i < 0 */ + int maxblocks; + /* - * Loop backwards through the bitmap blocks from - * the starting point-1 up to where we are now. - * There should be an extent which ends in this - * bitmap block and is long enough. + * Loop backwards to find the end of the extent + * we found in the realtime summary. + * + * maxblocks is the maximum possible number of + * bitmap blocks from the start of the extent + * to the end of the extent. */ - for (j = -1; j > i; j--) { - /* - * Grab the summary information for - * this bitmap block. - */ - error = xfs_rtany_summary(mp, tp, - log2len, mp->m_rsumlevels - 1, - bbno + j, rbpp, rsb, &any); - if (error) { - return error; - } - /* - * If there's no extent given in the - * summary that means the extent we - * found must carry over from an - * earlier block. If there is an - * extent given, we've already tried - * that allocation, don't do it again. - */ - if (any) - continue; - error = xfs_rtallocate_extent_block(mp, - tp, bbno + j, minlen, maxlen, - len, &n, rbpp, rsb, prod, &r); + if (maxlog == 0) + maxblocks = 0; + else if (maxlog < mp->m_blkbit_log) + maxblocks = 1; + else + maxblocks = 2 << (maxlog - mp->m_blkbit_log); + + /* + * We need to check bbno + i + maxblocks down to + * bbno + i. We already checked bbno down to + * bbno + j + 1, so we don't need to check those + * again. + */ + j = min(i + maxblocks, j); + for (; j >= i; j--) { + error = xfs_rtallocate_extent_block(args, + bbno + j, minlen, + maxavail, len, &n, prod, + &r); if (error) { return error; } /* * If it works, return the extent. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } } - /* - * There weren't intervening bitmap blocks - * with a long enough extent, or the - * allocation didn't work for some reason - * (i.e. it's a little * too short). - * Try to allocate from the summary block - * that we found. - */ - error = xfs_rtallocate_extent_block(mp, tp, - bbno + i, minlen, maxlen, len, &n, rbpp, - rsb, prod, &r); - if (error) { - return error; - } - /* - * If it works, return the extent. - */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; - } } } /* @@ -610,7 +599,7 @@ xfs_rtallocate_extent_near( else break; } - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } @@ -619,26 +608,25 @@ xfs_rtallocate_extent_near( * specified. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_size( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int error; /* error value */ - int i; /* bitmap block number */ - int l; /* level number (loop control) */ - xfs_rtblock_t n; /* next block to be tried */ - xfs_rtblock_t r; /* result block number */ - xfs_suminfo_t sum; /* summary information for extents */ - - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + struct xfs_mount *mp = args->mp; + int error; + xfs_fileoff_t i; /* bitmap block number */ + int l; /* level number (loop control) */ + xfs_rtxnum_t n; /* next rtext to be tried */ + xfs_rtxnum_t r; /* result rtext number */ + xfs_suminfo_t sum; /* summary information for extents */ + + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); ASSERT(maxlen != 0); /* @@ -656,8 +644,7 @@ xfs_rtallocate_extent_size( /* * Get the summary for this level/block. */ - error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, - &sum); + error = xfs_rtget_summary(args, l, i, &sum); if (error) { return error; } @@ -669,16 +656,16 @@ xfs_rtallocate_extent_size( /* * Try allocating the extent. */ - error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, - maxlen, len, &n, rbpp, rsb, prod, &r); + error = xfs_rtallocate_extent_block(args, i, maxlen, + maxlen, len, &n, prod, &r); if (error) { return error; } /* * If it worked, return that. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } /* @@ -686,8 +673,8 @@ xfs_rtallocate_extent_size( * allocator is beyond the next bitmap block, * skip to that bitmap block. */ - if (XFS_BITTOBLOCK(mp, n) > i + 1) - i = XFS_BITTOBLOCK(mp, n) - 1; + if (xfs_rtx_to_rbmblock(mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(mp, n) - 1; } } /* @@ -695,7 +682,7 @@ xfs_rtallocate_extent_size( * we're asking for a fixed size extent. */ if (minlen > --maxlen) { - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } ASSERT(minlen != 0); @@ -715,8 +702,7 @@ xfs_rtallocate_extent_size( /* * Get the summary information for this level/block. */ - error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, - &sum); + error = xfs_rtget_summary(args, l, i, &sum); if (error) { return error; } @@ -730,18 +716,18 @@ xfs_rtallocate_extent_size( * minlen/maxlen are in the possible range for * this summary level. */ - error = xfs_rtallocate_extent_block(mp, tp, i, + error = xfs_rtallocate_extent_block(args, i, XFS_RTMAX(minlen, 1 << l), XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, rbpp, rsb, prod, &r); + len, &n, prod, &r); if (error) { return error; } /* * If it worked, return that extent. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } /* @@ -749,14 +735,14 @@ xfs_rtallocate_extent_size( * allocator is beyond the next bitmap block, * skip to that bitmap block. */ - if (XFS_BITTOBLOCK(mp, n) > i + 1) - i = XFS_BITTOBLOCK(mp, n) - 1; + if (xfs_rtx_to_rbmblock(mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(mp, n) - 1; } } /* * Got nothing, return failure. */ - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } @@ -886,12 +872,14 @@ xfs_alloc_rsum_cache( xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */ { /* - * The rsum cache is initialized to all zeroes, which is trivially a - * lower bound on the minimum level with any free extents. We can - * continue without the cache if it couldn't be allocated. + * The rsum cache is initialized to the maximum value, which is + * trivially an upper bound on the maximum level with any free extents. + * We can continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL); - if (!mp->m_rsum_cache) + mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); + if (mp->m_rsum_cache) + memset(mp->m_rsum_cache, -1, rbmblocks); + else xfs_warn(mp, "could not allocate realtime summary cache"); } @@ -907,13 +895,13 @@ xfs_growfs_rt( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_rt_t *in) /* growfs rt input struct */ { - xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_fileoff_t bmbno; /* bitmap block number */ struct xfs_buf *bp; /* temporary buffer */ int error; /* error return value */ xfs_mount_t *nmp; /* new (fake) mount structure */ xfs_rfsblock_t nrblocks; /* new number of realtime blocks */ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ - xfs_rtblock_t nrextents; /* new number of realtime extents */ + xfs_rtxnum_t nrextents; /* new number of realtime extents */ uint8_t nrextslog; /* new log2 of sb_rextents */ xfs_extlen_t nrsumblocks; /* new number of summary blocks */ uint nrsumlevels; /* new rt summary levels */ @@ -922,7 +910,6 @@ xfs_growfs_rt( xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ - xfs_fsblock_t sumbno; /* summary block number */ uint8_t *rsum_cache; /* old summary cache */ sbp = &mp->m_sb; @@ -954,7 +941,7 @@ xfs_growfs_rt( return -EINVAL; /* Unsupported realtime features. */ - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) + if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) return -EOPNOTSUPP; nrblocks = in->newblocks; @@ -976,11 +963,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); + nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); nrextslog = xfs_highbit32(nrextents); nrsumlevels = nrextslog + 1; - nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; - nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); /* * New summary size can't be more than half the size of @@ -1023,6 +1009,12 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { + struct xfs_rtalloc_args args = { + .mp = mp, + }; + struct xfs_rtalloc_args nargs = { + .mp = nmp, + }; struct xfs_trans *tp; xfs_rfsblock_t nrblocks_step; @@ -1032,19 +1024,17 @@ xfs_growfs_rt( * Calculate new sb and mount fields for this round. */ nsbp->sb_rextsize = in->extsize; + nmp->m_rtxblklog = -1; /* don't use shift or masking */ nsbp->sb_rbmblocks = bmbno + 1; nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize * nsbp->sb_rextsize; nsbp->sb_rblocks = min(nrblocks, nrblocks_step); - nsbp->sb_rextents = nsbp->sb_rblocks; - do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; - nrsumsize = - (uint)sizeof(xfs_suminfo_t) * nrsumlevels * - nsbp->sb_rbmblocks; - nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, + nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); /* * Start a transaction, get the log reservation. @@ -1053,6 +1043,9 @@ xfs_growfs_rt( &tp); if (error) break; + args.tp = tp; + nargs.tp = tp; + /* * Lock out other callers by grabbing the bitmap inode lock. */ @@ -1086,7 +1079,7 @@ xfs_growfs_rt( */ if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || mp->m_rsumlevels != nmp->m_rsumlevels) { - error = xfs_rtcopy_summary(mp, nmp, tp); + error = xfs_rtcopy_summary(&args, &nargs); if (error) goto error_cancel; } @@ -1111,9 +1104,9 @@ xfs_growfs_rt( /* * Free new extent. */ - bp = NULL; - error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, - nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + error = xfs_rtfree_range(&nargs, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents); + xfs_rtbuf_cache_relse(&nargs); if (error) { error_cancel: xfs_trans_cancel(tp); @@ -1171,59 +1164,60 @@ out_free: * parameters. The length units are all in realtime extents, as is the * result block number. */ -int /* error */ +int xfs_rtallocate_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_trans *tp, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + int wasdel, /* was a delayed allocation extent */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtblock) /* out: start rtext allocated */ { - xfs_mount_t *mp = tp->t_mountp; - int error; /* error value */ - xfs_rtblock_t r; /* result allocated block */ - xfs_fsblock_t sb; /* summary file block number */ - struct xfs_buf *sumbp; /* summary file block buffer */ - - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + struct xfs_rtalloc_args args = { + .mp = tp->t_mountp, + .tp = tp, + }; + int error; /* error value */ + xfs_rtxnum_t r; /* result allocated rtext */ + + ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); ASSERT(minlen > 0 && minlen <= maxlen); /* * If prod is set then figure out what to do to minlen and maxlen. */ if (prod > 1) { - xfs_extlen_t i; + xfs_rtxlen_t i; if ((i = maxlen % prod)) maxlen -= i; if ((i = minlen % prod)) minlen += prod - i; if (maxlen < minlen) { - *rtblock = NULLRTBLOCK; + *rtblock = NULLRTEXTNO; return 0; } } retry: - sumbp = NULL; - if (bno == 0) { - error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, - &sumbp, &sb, prod, &r); + if (start == 0) { + error = xfs_rtallocate_extent_size(&args, minlen, + maxlen, len, prod, &r); } else { - error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, - len, &sumbp, &sb, prod, &r); + error = xfs_rtallocate_extent_near(&args, start, minlen, + maxlen, len, prod, &r); } + xfs_rtbuf_cache_relse(&args); if (error) return error; /* * If it worked, update the superblock. */ - if (r != NULLRTBLOCK) { + if (r != NULLRTEXTNO) { long slen = (long)*len; ASSERT(*len >= minlen && *len <= maxlen); @@ -1250,6 +1244,7 @@ xfs_rtmount_init( struct xfs_buf *bp; /* buffer for last block of subvolume */ struct xfs_sb *sbp; /* filesystem superblock copy in mount */ xfs_daddr_t d; /* address of last block of subvolume */ + unsigned int rsumblocks; int error; sbp = &mp->m_sb; @@ -1261,10 +1256,9 @@ xfs_rtmount_init( return -ENODEV; } mp->m_rsumlevels = sbp->sb_rextslog + 1; - mp->m_rsumsize = - (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * - sbp->sb_rbmblocks; - mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels, + mp->m_sb.sb_rbmblocks); + mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks); mp->m_rbmip = mp->m_rsumip = NULL; /* * Check that the realtime section is an ok size. @@ -1418,27 +1412,27 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... */ -int /* error */ +int /* error */ xfs_rtpick_extent( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick) /* result rt extent */ - { - xfs_rtblock_t b; /* result block */ + xfs_rtxlen_t len, /* allocation length (rtextents) */ + xfs_rtxnum_t *pick) /* result rt extent */ +{ + xfs_rtxnum_t b; /* result rtext */ int log2; /* log of sequence number */ uint64_t resid; /* residual after log removed */ uint64_t seq; /* sequence number of file creation */ - struct timespec64 ts; /* temporary timespec64 storage */ + struct timespec64 ts; /* timespec in inode */ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + ts = inode_get_atime(VFS_I(mp->m_rbmip)); if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; seq = 0; } else { - ts = inode_get_atime(VFS_I(mp->m_rbmip)); - seq = (uint64_t)ts.tv_sec; + seq = ts.tv_sec; } if ((log2 = xfs_highbit64(seq)) == -1) b = 0; @@ -1451,7 +1445,7 @@ xfs_rtpick_extent( if (b + len > mp->m_sb.sb_rextents) b = mp->m_sb.sb_rextents - len; } - ts.tv_sec = (time64_t)seq + 1; + ts.tv_sec = seq + 1; inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); *pick = b; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 62c7ad79cbb6..f7cb9ffe51ca 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -11,22 +11,6 @@ struct xfs_mount; struct xfs_trans; -/* - * XXX: Most of the realtime allocation functions deal in units of realtime - * extents, not realtime blocks. This looks funny when paired with the type - * name and screams for a larger cleanup. - */ -struct xfs_rtalloc_rec { - xfs_rtblock_t ar_startext; - xfs_rtblock_t ar_extcount; -}; - -typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_mount *mp, - struct xfs_trans *tp, - const struct xfs_rtalloc_rec *rec, - void *priv); - #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. @@ -40,23 +24,14 @@ typedef int (*xfs_rtalloc_query_range_fn)( int /* error */ xfs_rtallocate_extent( struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ int wasdel, /* was a delayed allocation extent */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock); /* out: start block allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtblock); /* out: start rtext allocated */ -/* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len); /* length of extent freed */ /* * Initialize realtime fields in the mount structure. @@ -87,8 +62,8 @@ int /* error */ xfs_rtpick_extent( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick); /* result rt extent */ + xfs_rtxlen_t len, /* allocation length (rtextents) */ + xfs_rtxnum_t *pick); /* result rt extent */ /* * Grow the realtime area of the filesystem. @@ -98,55 +73,12 @@ xfs_growfs_rt( struct xfs_mount *mp, /* file system mount structure */ xfs_growfs_rt_t *in); /* user supplied growfs struct */ -/* - * From xfs_rtbitmap.c - */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); -int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, int val, - xfs_rtblock_t *new, int *stat); -int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_rtblock_t limit, - xfs_rtblock_t *rtblock); -int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_rtblock_t limit, - xfs_rtblock_t *rtblock); -int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, int val); -int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp, - int log, xfs_rtblock_t bbno, int delta, - struct xfs_buf **rbpp, xfs_fsblock_t *rsb, - xfs_suminfo_t *sum); -int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, - xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp, - xfs_fsblock_t *rsb); -int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, - struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, - xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtalloc_query_range_fn fn, - void *priv); -bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); -int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, - bool *is_free); int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) -# define xfs_rtfree_extent(t,b,l) (ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) -# define xfs_growfs_rt(mp,in) (ENOSYS) -# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) -# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) -# define xfs_verify_rtbno(m, r) (false) -# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) -# define xfs_rtalloc_reinit_frextents(m) (0) +# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS) +# define xfs_growfs_rt(mp,in) (-ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ @@ -157,7 +89,7 @@ xfs_rtmount_init( xfs_warn(mp, "Not built with CONFIG_XFS_RT"); return -ENOSYS; } -# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f0ae07828153..764304595e8b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -42,6 +42,7 @@ #include "xfs_xattr.h" #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" +#include "xfs_rtbitmap.h" #include "scrub/stats.h" #include <linux/magic.h> @@ -896,7 +897,7 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_rblocks; freertx = percpu_counter_sum_positive(&mp->m_frextents); - statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; + statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx); } return 0; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8c0bfc9a33b1..305c9d07bf1b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -24,6 +24,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_icache.h" +#include "xfs_rtbitmap.h" struct kmem_cache *xfs_trans_cache; @@ -655,6 +656,10 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + if (tp->t_rextsize_delta) { + mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize); + } mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; mp->m_sb.sb_rblocks += tp->t_rblocks_delta; mp->m_sb.sb_rextents += tp->t_rextents_delta; @@ -1196,7 +1201,7 @@ xfs_trans_alloc_inode( retry: error = xfs_trans_alloc(mp, resv, dblocks, - rblocks / mp->m_sb.sb_rextsize, + xfs_extlen_to_rtxlen(mp, rblocks), force ? XFS_TRANS_RESERVE : 0, &tp); if (error) return error; |