diff options
Diffstat (limited to 'fs/bcachefs/fs.c')
-rw-r--r-- | fs/bcachefs/fs.c | 427 |
1 files changed, 277 insertions, 150 deletions
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 011817afc3ad..4a1bb07a2574 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -108,7 +108,7 @@ retry: goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "%s: inode %u:%llu not found when updating", + "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); @@ -152,50 +152,106 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static int bch2_iget5_test(struct inode *vinode, void *p) +static bool subvol_inum_eq(subvol_inum a, subvol_inum b) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; - - return inode->ei_subvol == inum->subvol && - inode->ei_inode.bi_inum == inum->inum; + return a.subvol == b.subvol && a.inum == b.inum; } -static int bch2_iget5_set(struct inode *vinode, void *p) +static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; + const struct bch_inode_info *inode = obj; + const subvol_inum *v = arg->key; - inode->v.i_ino = inum->inum; - inode->ei_subvol = inum->subvol; - inode->ei_inode.bi_inum = inum->inum; - return 0; + return !subvol_inum_eq(inode->ei_inum, *v); } -static unsigned bch2_inode_hash(subvol_inum inum) +static const struct rhashtable_params bch2_vfs_inodes_params = { + .head_offset = offsetof(struct bch_inode_info, hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum), + .key_len = sizeof(subvol_inum), + .obj_cmpfn = bch2_vfs_inode_cmp_fn, + .automatic_shrinking = true, +}; + +static void __wait_on_freeing_inode(struct inode *inode) { - return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); } struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { - return to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); +} + +static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, + subvol_inum inum) +{ + struct bch_inode_info *inode; +repeat: + inode = __bch2_inode_hash_find(c, inum); + if (inode) { + spin_lock(&inode->v.i_lock); + if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { + spin_unlock(&inode->v.i_lock); + return NULL; + } + if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { + if (!trans) { + __wait_on_freeing_inode(&inode->v); + } else { + bch2_trans_unlock(trans); + __wait_on_freeing_inode(&inode->v); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } + goto repeat; + } + __iget(&inode->v); + spin_unlock(&inode->v.i_lock); + } + + return inode; +} + +static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) +{ + spin_lock(&inode->v.i_lock); + bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); + spin_unlock(&inode->v.i_lock); + + if (remove) { + int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + &inode->hash, bch2_vfs_inodes_params); + BUG_ON(ret); + inode->v.i_hash.pprev = NULL; + } } -static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) +static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, + struct btree_trans *trans, + struct bch_inode_info *inode) { - subvol_inum inum = inode_inum(inode); - struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); + struct bch_inode_info *old = inode; + + set_bit(EI_INODE_HASHED, &inode->ei_flags); +retry: + if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + &inode->hash, + bch2_vfs_inodes_params))) { + old = bch2_inode_hash_find(c, trans, inode->ei_inum); + if (!old) + goto retry; + + clear_bit(EI_INODE_HASHED, &inode->ei_flags); - if (unlikely(old != inode)) { /* * bcachefs doesn't use I_NEW; we have no use for it since we * only insert fully created inodes in the inode hash table. But @@ -209,21 +265,17 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino */ set_nlink(&inode->v, 1); discard_new_inode(&inode->v); - inode = old; + return old; } else { + inode_fake_hash(&inode->v); + + inode_sb_list_add(&inode->v); + mutex_lock(&c->vfs_inodes_lock); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); - /* - * Again, I_NEW makes no sense for bcachefs. This is only needed - * for clearing I_NEW, but since the inode was already fully - * created and initialized we didn't actually want - * inode_insert5() to set it for us. - */ - unlock_new_inode(&inode->v); + return inode; } - - return inode; } #define memalloc_flags_do(_flags, _do) \ @@ -241,7 +293,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) { - struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, + bch2_inode_cache, GFP_NOFS); if (!inode) return NULL; @@ -283,13 +336,24 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) return inode; } +static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) +{ + struct bch_inode_info *inode = bch2_new_inode(trans); + if (IS_ERR(inode)) + return inode; + + bch2_vfs_inode_init(trans, inum, inode, bi, subvol); + + return bch2_inode_hash_insert(trans->c, trans, inode); + +} + struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); if (inode) return &inode->v; @@ -300,11 +364,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) int ret = lockrestart_do(trans, bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: - PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); - if (!ret) { - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); - } + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); return ret ? ERR_PTR(ret) : &inode->v; @@ -325,6 +385,8 @@ __bch2_create(struct mnt_idmap *idmap, subvol_inum inum; struct bch_subvolume subvol; u64 journal_seq = 0; + kuid_t kuid; + kgid_t kgid; int ret; /* @@ -351,13 +413,15 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?: + kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); + kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); + ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), current_fsuid()), - from_kgid(i_user_ns(&dir->v), current_fsgid()), + from_kuid(i_user_ns(&dir->v), kuid), + from_kgid(i_user_ns(&dir->v), kgid), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -365,7 +429,7 @@ retry: if (unlikely(ret)) goto err_before_quota; - inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, @@ -395,8 +459,16 @@ err_before_quota: * we must insert the new inode into the inode cache before calling * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: + * + * also, calling bch2_inode_hash_insert() without passing in the + * transaction object is sketchy - if we could ever end up in + * __wait_on_freeing_inode(), we'd risk deadlock. + * + * But that shouldn't be possible, since we still have the inode locked + * that we just created, and we _really_ can't take a transaction + * restart here. */ - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, NULL, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -436,11 +508,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) goto err; - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); if (inode) goto out; @@ -448,7 +516,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: - PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n %s", @@ -468,9 +536,6 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, ret = -ENOENT; goto err; } - - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -557,8 +622,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: - bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: + bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return bch2_err_class(ret); @@ -614,7 +679,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: __bch2_unlink(vdir, dentry, false); return bch2_err_class(ret); } @@ -671,15 +736,16 @@ static int bch2_rename2(struct mnt_idmap *idmap, struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? BCH_RENAME_OVERWRITE : BCH_RENAME; + bool whiteout = !!(flags & RENAME_WHITEOUT); int ret; - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) return -EINVAL; if (mode == BCH_RENAME_OVERWRITE) { @@ -697,8 +763,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, trans = bch2_trans_get(c); - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: + bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) goto err; @@ -720,18 +786,48 @@ static int bch2_rename2(struct mnt_idmap *idmap, if (ret) goto err; } +retry: + bch2_trans_begin(trans); - ret = commit_do(trans, NULL, NULL, 0, - bch2_rename_trans(trans, - inode_inum(src_dir), &src_dir_u, - inode_inum(dst_dir), &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode)); + ret = bch2_rename_trans(trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode); if (unlikely(ret)) + goto err_tx_restart; + + if (whiteout) { + whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); + ret = PTR_ERR_OR_ZERO(whiteout_inode_u); + if (unlikely(ret)) + goto err_tx_restart; + bch2_inode_init_early(c, whiteout_inode_u); + + ret = bch2_create_trans(trans, + inode_inum(src_dir), &src_dir_u, + whiteout_inode_u, + &src_dentry->d_name, + from_kuid(i_user_ns(&src_dir->v), current_fsuid()), + from_kgid(i_user_ns(&src_dir->v), current_fsgid()), + S_IFCHR|WHITEOUT_MODE, 0, + NULL, NULL, (subvol_inum) { 0 }, 0) ?: + bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_tx_restart; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (unlikely(ret)) { +err_tx_restart: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; goto err; + } BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); BUG_ON(dst_inode && @@ -779,11 +875,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; + kuid_t kuid; + kgid_t kgid; - if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); - if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); + } + if (ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); + } if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; @@ -798,11 +900,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID - ? attr->ia_gid + ? kgid : inode->v.i_gid; - if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) + if (!in_group_or_capable(idmap, &inode->v, + make_vfsgid(idmap, i_user_ns(&inode->v), gid))) mode &= ~S_ISGID; bi->bi_mode = mode; } @@ -818,17 +920,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; + kuid_t kuid; + kgid_t kgid; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; - if (attr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + if (attr->ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); + } - if (attr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (attr->ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); + } ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -884,13 +992,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); stat->dev = inode->v.i_sb->s_dev; stat->ino = inode->v.i_ino; stat->mode = inode->v.i_mode; stat->nlink = inode->v.i_nlink; - stat->uid = inode->v.i_uid; - stat->gid = inode->v.i_gid; + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); stat->atime = inode_get_atime(&inode->v); @@ -899,7 +1009,7 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; - stat->subvol = inode->ei_subvol; + stat->subvol = inode->ei_inum.subvol; stat->result_mask |= STATX_SUBVOL; if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { @@ -941,7 +1051,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: setattr_prepare(idmap, dentry, iattr); if (ret) return ret; @@ -1034,7 +1144,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bkey_buf cur, prev; unsigned offset_into_extent, sectors; bool have_extent = false; - u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -1050,21 +1159,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); - if (ret) - goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(ei->v.i_ino, start, snapshot), 0); + POS(ei->v.i_ino, start), 0); - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, end)).k && - !(ret = bkey_err(k))) { + while (true) { enum btree_id data_btree = BTREE_ID_extents; + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_upto(&iter, end); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + break; + if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { bch2_btree_iter_advance(&iter); @@ -1108,16 +1226,12 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); - - ret = bch2_trans_relock(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } - start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; if (!ret && have_extent) { bch2_trans_unlock(trans); @@ -1173,7 +1287,7 @@ static int bch2_open(struct inode *vinode, struct file *file) struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, inode->ei_subvol); + int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); if (ret) return ret; } @@ -1305,8 +1419,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type) static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) { return (struct bcachefs_fid) { - .inum = inode->ei_inode.bi_inum, - .subvol = inode->ei_subvol, + .inum = inode->ei_inum.inum, + .subvol = inode->ei_inum.subvol, .gen = inode->ei_inode.bi_generation, }; } @@ -1391,7 +1505,7 @@ static struct dentry *bch2_get_parent(struct dentry *child) struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum parent_inum = { .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_subvol, + inode->ei_inum.subvol, .inum = inode->ei_inode.bi_dir, }; @@ -1427,7 +1541,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child retry: bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); if (ret) goto err; @@ -1458,8 +1572,7 @@ retry: if (ret) goto err; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } else { /* @@ -1480,8 +1593,7 @@ retry: if (ret) continue; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } } @@ -1513,12 +1625,15 @@ static const struct export_operations bch_export_ops = { .get_name = bch2_get_name, }; -static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, +static void bch2_vfs_inode_init(struct btree_trans *trans, + subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { - bch2_iget5_set(&inode->v, &inum); + inode->v.i_ino = inum.inum; + inode->ei_inum = inum; + inode->ei_inode.bi_inum = inum.inum; bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; @@ -1530,7 +1645,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_flags = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); - inode->ei_subvol = inum.subvol; if (BCH_SUBVOLUME_SNAP(subvol)) set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); @@ -1597,6 +1711,17 @@ static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(vinode); + bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); + + /* + * evict() has waited for outstanding writeback, we'll do no more IO + * through this inode: it's safe to remove from VFS inode hashtable here + * + * Do that now so that other threads aren't blocked from pulling it back + * in, there's no reason for them to be: + */ + if (!delete) + bch2_inode_hash_remove(c, inode); truncate_inode_pages_final(&inode->v.i_data); @@ -1604,12 +1729,18 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + if (delete) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); + + /* + * If we are deleting, we need it present in the vfs hash table + * so that fsck can check if unlinked inodes are still open: + */ + bch2_inode_hash_remove(c, inode); } mutex_lock(&c->vfs_inodes_lock); @@ -1639,7 +1770,7 @@ again: mutex_lock(&c->vfs_inodes_lock); list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_subvol)) + if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) continue; if (!(inode->v.i_state & I_DONTCACHE) && @@ -1801,30 +1932,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; - enum bch_opt_id i; struct printbuf buf = PRINTBUF; - int ret = 0; - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); + bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, + OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); + printbuf_nul_terminate(&buf); + seq_puts(seq, buf.buf); - if ((opt->flags & OPT_HIDDEN) || - !(opt->flags & OPT_MOUNT)) - continue; - - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - printbuf_reset(&buf); - bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, - OPT_SHOW_MOUNT_STYLE); - seq_putc(seq, ','); - seq_puts(seq, buf.buf); - } - - if (buf.allocation_failure) - ret = -ENOMEM; + int ret = buf.allocation_failure ? -ENOMEM : 0; printbuf_exit(&buf); return ret; } @@ -2129,12 +2244,23 @@ static int bch2_init_fs_context(struct fs_context *fc) return 0; } +void bch2_fs_vfs_exit(struct bch_fs *c) +{ + if (c->vfs_inodes_table.tbl) + rhashtable_destroy(&c->vfs_inodes_table); +} + +int bch2_fs_vfs_init(struct bch_fs *c) +{ + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); +} + static struct file_system_type bcache_fs_type = { .owner = THIS_MODULE, .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("bcachefs"); @@ -2149,7 +2275,8 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT); if (!bch2_inode_cache) goto err; |